26 #include "llvm/IR/IntrinsicsAMDGPU.h" 28 #define DEBUG_TYPE "amdgpu-legalinfo" 31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
38 "amdgpu-global-isel-new-legality",
39 cl::desc(
"Use GlobalISel desired legality, rather than try to use" 40 "rules compatible with selection patterns"),
65 const LLT Ty = Query.Types[TypeIdx];
72 EltSize > 1 && EltSize < 32 &&
79 const LLT Ty = Query.Types[TypeIdx];
86 const LLT Ty = Query.Types[TypeIdx];
94 const LLT Ty = Query.Types[TypeIdx];
102 const LLT Ty = Query.Types[TypeIdx];
105 unsigned Pieces = (
Size + 63) / 64;
115 const LLT Ty = Query.Types[TypeIdx];
120 const int NextMul32 = (
Size + 31) / 32;
124 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125 return std::make_pair(TypeIdx,
LLT::vector(NewNumElts, EltTy));
144 const LLT Ty = Query.Types[TypeIdx];
151 const LLT Ty = Query.Types[TypeIdx];
160 const LLT QueryTy = Query.Types[TypeIdx];
167 const LLT QueryTy = Query.Types[TypeIdx];
174 const LLT QueryTy = Query.Types[TypeIdx];
185 return EltSize == 16 || EltSize % 32 == 0;
190 return EltSize == 32 || EltSize == 64 ||
192 EltSize == 128 || EltSize == 256;
215 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT Ty = Query.Types[TypeIdx];
239 return ST.enableFlatScratch() ? 128 : 32;
241 return ST.useDS128() ? 128 : 64;
251 return IsLoad ? 512 : 128;
265 const bool IsLoad = Opcode != AMDGPU::G_STORE;
268 unsigned MemSize = Query.
MMODescrs[0].SizeInBits;
269 unsigned AlignBits = Query.
MMODescrs[0].AlignInBits;
270 unsigned AS = Query.
Types[1].getAddressSpace();
280 if (IsLoad && MemSize <
Size)
285 if (MemSize != RegSize && RegSize != 32)
299 if (!
ST.hasDwordx3LoadStores())
310 assert(RegSize >= MemSize);
312 if (AlignBits < MemSize) {
315 Align(AlignBits / 8)))
341 return EltSize != 32 && EltSize != 64;
354 const unsigned MemSizeInBits) {
356 if (
Size != MemSizeInBits)
369 unsigned AlignInBits,
unsigned AddrSpace,
378 if (SizeInBits == 96 &&
ST.hasDwordx3LoadStores())
389 if (AlignInBits < RoundedSize)
396 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
408 Query.
Types[1].getAddressSpace(), Opcode);
414 using namespace TargetOpcode;
416 auto GetAddrSpacePtr = [&
TM](
unsigned AS) {
460 std::initializer_list<LLT> AllS32Vectors =
461 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463 std::initializer_list<LLT> AllS64Vectors =
464 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
474 const LLT CodePtr = FlatPtr;
476 const std::initializer_list<LLT> AddrSpaces64 = {
477 GlobalPtr, ConstantPtr, FlatPtr
480 const std::initializer_list<LLT> AddrSpaces32 = {
481 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
484 const std::initializer_list<LLT> FPTypesBase = {
488 const std::initializer_list<LLT> FPTypes16 = {
492 const std::initializer_list<LLT> FPTypesPK16 = {
496 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
504 .
legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505 .legalFor(AllS32Vectors)
516 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
519 .legalFor({S32, S16, V2S16})
520 .clampScalar(0, S16, S32)
526 .legalFor({S32, S16, V2S16})
527 .minScalarOrElt(0, S16)
532 }
else if (ST.has16BitInsts()) {
534 .legalFor({S32, S16})
535 .clampScalar(0, S16, S32)
542 .legalFor({S32, S16})
557 .clampScalar(0, S32, S32)
560 if (ST.hasIntClamp()) {
564 .minScalarOrElt(0, S32)
583 .customFor({S32, S64})
584 .clampScalar(0, S32, S64)
590 .maxScalarOrElt(0, S32);
592 if (ST.hasVOP3PInsts()) {
594 .clampMaxNumElements(0, S8, 2)
605 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606 .clampScalar(0, S32, S64)
613 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614 .legalFor({{S32, S1}, {S32, S32}})
626 .
legalFor({S1, S32, S64, S16, GlobalPtr,
627 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
634 .clampScalar(0, S16, S64);
659 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660 .legalFor({S32, S64});
662 .customFor({S32, S64});
666 if (ST.has16BitInsts()) {
667 if (ST.hasVOP3PInsts())
677 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
679 if (ST.hasVOP3PInsts()) {
685 }
else if (ST.has16BitInsts()) {
695 if (ST.hasVOP3PInsts())
700 .
clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
704 .
clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
708 .
clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
711 .legalFor(FPTypesPK16)
716 if (ST.has16BitInsts()) {
718 .legalFor({S32, S64, S16})
720 .clampScalar(0, S16, S64);
725 .clampScalar(0, S32, S64);
727 if (ST.hasFractBug()) {
730 .legalFor({S32, S64})
732 .clampScalar(0, S32, S64);
737 .clampScalar(0, S32, S64);
748 .narrowScalarFor({{S64, S16}},
changeTo(0, S32))
755 .lowerFor({S64, S16, V2S16})
757 .clampScalar(0, S32, S64);
761 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762 FMad.customFor({S32, S16});
763 else if (ST.hasMadMacF32Insts())
764 FMad.customFor({S32});
765 else if (ST.hasMadF16())
766 FMad.customFor({S16});
771 if (ST.has16BitInsts()) {
772 FRem.customFor({S16, S32, S64});
774 FRem.minScalar(0, S32)
775 .customFor({S32, S64});
783 .clampMaxNumElements(0, S16, 2)
791 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792 {S32, S1}, {S64, S1}, {S16, S1}})
794 .clampScalar(0, S32, S64)
795 .widenScalarToNextPow2(1, 32);
799 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800 .lowerFor({{S32, S64}})
803 if (ST.has16BitInsts())
811 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812 .customFor({{S64, S64}})
813 .narrowScalarFor({{S64, S16}},
changeTo(0, S32));
824 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
828 if (
ST.has16BitInsts()) {
829 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830 .legalFor({S16, S32, S64})
831 .clampScalar(0, S16, S64)
834 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835 .legalFor({S32, S64})
836 .clampScalar(0, S32, S64)
839 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842 .clampScalar(0, S32, S64)
846 getActionDefinitionsBuilder(G_PTR_ADD)
849 .scalarSameSizeAs(1, 0);
851 getActionDefinitionsBuilder(G_PTRMASK)
853 .scalarSameSizeAs(1, 0)
857 getActionDefinitionsBuilder(G_ICMP)
868 .legalForCartesianProduct(
869 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870 .legalForCartesianProduct(
871 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872 if (
ST.has16BitInsts()) {
873 CmpBuilder.legalFor({{S1, S16}});
877 .widenScalarToNextPow2(1)
878 .clampScalar(1, S32, S64)
882 getActionDefinitionsBuilder(G_FCMP)
883 .legalForCartesianProduct({S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884 .widenScalarToNextPow2(1)
885 .clampScalar(1, S32, S64)
889 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890 if (
ST.has16BitInsts())
891 Exp2Ops.legalFor({S32, S16});
893 Exp2Ops.legalFor({S32});
894 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895 Exp2Ops.scalarize(0);
897 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898 if (
ST.has16BitInsts())
899 ExpOps.customFor({{S32}, {S16}});
901 ExpOps.customFor({S32});
902 ExpOps.clampScalar(0, MinScalarFPTy, S32)
905 getActionDefinitionsBuilder(G_FPOWI)
906 .clampScalar(0, MinScalarFPTy, S32)
910 getActionDefinitionsBuilder(G_CTPOP)
911 .legalFor({{S32, S32}, {S32, S64}})
912 .clampScalar(0, S32, S32)
913 .clampScalar(1, S32, S64)
915 .widenScalarToNextPow2(0, 32)
916 .widenScalarToNextPow2(1, 32);
921 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
923 .clampScalar(0, S32, S32)
924 .clampScalar(1, S32, S64)
925 .widenScalarToNextPow2(0, 32)
926 .widenScalarToNextPow2(1, 32)
930 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931 .legalFor({{S32, S32}, {S32, S64}})
932 .clampScalar(0, S32, S32)
933 .clampScalar(1, S32, S64)
935 .widenScalarToNextPow2(0, 32)
936 .widenScalarToNextPow2(1, 32);
938 getActionDefinitionsBuilder(G_BITREVERSE)
940 .clampScalar(0, S32, S32)
943 if (
ST.has16BitInsts()) {
944 getActionDefinitionsBuilder(G_BSWAP)
945 .legalFor({S16, S32, V2S16})
946 .clampMaxNumElements(0, S16, 2)
949 .widenScalarToNextPow2(0)
950 .clampScalar(0, S16, S32)
953 if (
ST.hasVOP3PInsts()) {
954 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
955 .legalFor({S32, S16, V2S16})
957 .clampMaxNumElements(0, S16, 2)
959 .widenScalarToNextPow2(0)
963 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
964 .legalFor({S32, S16})
965 .widenScalarToNextPow2(0)
972 getActionDefinitionsBuilder(G_BSWAP)
977 .widenScalarToNextPow2(0)
982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
985 .widenScalarToNextPow2(0)
990 getActionDefinitionsBuilder(G_INTTOPTR)
992 .legalForCartesianProduct(AddrSpaces64, {S64})
993 .legalForCartesianProduct(AddrSpaces32, {S32})
999 return std::make_pair(1, LLT::scalar(Query.
Types[0].getSizeInBits()));
1003 return std::make_pair(1, LLT::scalar(Query.
Types[0].getSizeInBits()));
1006 getActionDefinitionsBuilder(G_PTRTOINT)
1008 .legalForCartesianProduct(AddrSpaces64, {S64})
1009 .legalForCartesianProduct(AddrSpaces32, {S32})
1015 return std::make_pair(0, LLT::scalar(Query.
Types[1].getSizeInBits()));
1020 return std::make_pair(0, LLT::scalar(Query.
Types[1].getSizeInBits()));
1023 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1027 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1028 bool IsLoad) ->
bool {
1032 unsigned MemSize = Query.
MMODescrs[0].SizeInBits;
1033 unsigned AlignBits = Query.
MMODescrs[0].AlignInBits;
1036 MemSize =
std::max(MemSize, AlignBits);
1048 unsigned NumRegs = (MemSize + 31) / 32;
1050 if (!
ST.hasDwordx3LoadStores())
1058 if (AlignBits < MemSize) {
1061 Align(AlignBits / 8));
1067 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1075 for (
unsigned Op : {G_LOAD, G_STORE}) {
1076 const bool IsStore =
Op == G_STORE;
1078 auto &Actions = getActionDefinitionsBuilder(
Op);
1081 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1082 {V2S32, GlobalPtr, 64, GlobalAlign32},
1083 {V4S32, GlobalPtr, 128, GlobalAlign32},
1084 {S64, GlobalPtr, 64, GlobalAlign32},
1085 {V2S64, GlobalPtr, 128, GlobalAlign32},
1086 {V2S16, GlobalPtr, 32, GlobalAlign32},
1087 {S32, GlobalPtr, 8, GlobalAlign8},
1088 {S32, GlobalPtr, 16, GlobalAlign16},
1090 {S32, LocalPtr, 32, 32},
1091 {S64, LocalPtr, 64, 32},
1092 {V2S32, LocalPtr, 64, 32},
1093 {S32, LocalPtr, 8, 8},
1094 {S32, LocalPtr, 16, 16},
1095 {V2S16, LocalPtr, 32, 32},
1097 {S32, PrivatePtr, 32, 32},
1098 {S32, PrivatePtr, 8, 8},
1099 {S32, PrivatePtr, 16, 16},
1100 {V2S16, PrivatePtr, 32, 32},
1102 {S32, ConstantPtr, 32, GlobalAlign32},
1103 {V2S32, ConstantPtr, 64, GlobalAlign32},
1104 {V4S32, ConstantPtr, 128, GlobalAlign32},
1105 {S64, ConstantPtr, 64, GlobalAlign32},
1106 {V2S32, ConstantPtr, 32, GlobalAlign32}});
1117 Actions.customIf(
typeIs(1, Constant32Ptr));
1143 return !Query.
Types[0].isVector() &&
1144 needToSplitMemOp(Query,
Op == G_LOAD);
1146 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1151 unsigned MemSize = Query.
MMODescrs[0].SizeInBits;
1154 if (DstSize > MemSize)
1155 return std::make_pair(0, LLT::scalar(MemSize));
1162 return std::make_pair(0, LLT::scalar(FloorSize));
1165 if (DstSize > 32 && (DstSize % 32 != 0)) {
1168 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1174 if (MemSize > MaxSize)
1175 return std::make_pair(0, LLT::scalar(MaxSize));
1178 return std::make_pair(0, LLT::scalar(
Align));
1182 return Query.
Types[0].isVector() &&
1183 needToSplitMemOp(Query,
Op == G_LOAD);
1185 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1199 if (Query.
MMODescrs[0].SizeInBits > MaxSize) {
1203 if (MaxSize % EltSize == 0) {
1204 return std::make_pair(
1205 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1208 unsigned NumPieces = Query.
MMODescrs[0].SizeInBits / MaxSize;
1212 if (NumPieces == 1 || NumPieces >= NumElts ||
1213 NumElts % NumPieces != 0)
1214 return std::make_pair(0, EltTy);
1216 return std::make_pair(0,
1221 unsigned MemSize = Query.
MMODescrs[0].SizeInBits;
1223 return std::make_pair(0, EltTy);
1232 return std::make_pair(
1233 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1238 if (EltSize >
Align &&
1244 return std::make_pair(0, EltTy);
1246 .lowerIfMemSizeNotPow2()
1253 .widenScalarToNextPow2(0)
1258 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1259 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1260 {S32, GlobalPtr, 16, 2 * 8},
1261 {S32, LocalPtr, 8, 8},
1262 {S32, LocalPtr, 16, 16},
1263 {S32, PrivatePtr, 8, 8},
1264 {S32, PrivatePtr, 16, 16},
1265 {S32, ConstantPtr, 8, 8},
1266 {S32, ConstantPtr, 16, 2 * 8}});
1267 if (
ST.hasFlatAddressSpace()) {
1268 ExtLoads.legalForTypesWithMemDesc(
1269 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1272 ExtLoads.clampScalar(0, S32, S32)
1273 .widenScalarToNextPow2(0)
1274 .unsupportedIfMemSizeNotPow2()
1277 auto &Atomics = getActionDefinitionsBuilder(
1278 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1282 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283 {S64, GlobalPtr}, {S64, LocalPtr},
1284 {S32, RegionPtr}, {S64, RegionPtr}});
1285 if (
ST.hasFlatAddressSpace()) {
1286 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1289 if (
ST.hasLDSFPAtomics()) {
1290 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291 .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1296 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1297 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1298 {S32, FlatPtr}, {S64, FlatPtr}})
1299 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1300 {S32, RegionPtr}, {S64, RegionPtr}});
1304 getActionDefinitionsBuilder(G_SELECT)
1305 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1306 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1308 .clampScalar(0, S16, S64)
1312 .clampMaxNumElements(0, S32, 2)
1313 .clampMaxNumElements(0, LocalPtr, 2)
1314 .clampMaxNumElements(0, PrivatePtr, 2)
1316 .widenScalarToNextPow2(0)
1321 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1322 .legalFor({{S32, S32}, {S64, S32}});
1323 if (
ST.has16BitInsts()) {
1324 if (
ST.hasVOP3PInsts()) {
1325 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1326 .clampMaxNumElements(0, S16, 2);
1328 Shifts.legalFor({{S16, S16}});
1331 Shifts.widenScalarIf(
1336 const LLT AmountTy = Query.
Types[1];
1340 Shifts.maxScalarIf(
typeIs(0, S16), 1, S16);
1341 Shifts.clampScalar(1, S32, S32);
1342 Shifts.clampScalar(0, S16, S64);
1343 Shifts.widenScalarToNextPow2(0, 16);
1345 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1353 Shifts.clampScalar(1, S32, S32);
1354 Shifts.clampScalar(0, S32, S64);
1355 Shifts.widenScalarToNextPow2(0, 32);
1357 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1362 Shifts.scalarize(0);
1364 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1365 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1366 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1367 unsigned IdxTypeIdx = 2;
1369 getActionDefinitionsBuilder(
Op)
1371 const LLT EltTy = Query.
Types[EltTypeIdx];
1372 const LLT VecTy = Query.
Types[VecTypeIdx];
1373 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1375 return (EltSize == 32 || EltSize == 64) &&
1389 const LLT EltTy = Query.
Types[EltTypeIdx];
1390 const LLT VecTy = Query.
Types[VecTypeIdx];
1394 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1395 return std::make_pair(
1396 VecTypeIdx,
LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1398 .clampScalar(EltTypeIdx, S32, S64)
1399 .clampScalar(VecTypeIdx, S32, S64)
1400 .clampScalar(IdxTypeIdx, S32, S32)
1401 .clampMaxNumElements(VecTypeIdx, S32, 32)
1408 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1410 const LLT &EltTy = Query.
Types[1].getElementType();
1411 return Query.
Types[0] != EltTy;
1414 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1415 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1416 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1419 getActionDefinitionsBuilder(
Op)
1423 const LLT BigTy = Query.
Types[BigTyIdx];
1424 const LLT LitTy = Query.
Types[LitTyIdx];
1430 const LLT BigTy = Query.
Types[BigTyIdx];
1436 const LLT LitTy = Query.
Types[LitTyIdx];
1441 .widenScalarToNextPow2(BigTyIdx, 32);
1445 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1446 .legalForCartesianProduct(AllS32Vectors, {S32})
1447 .legalForCartesianProduct(AllS64Vectors, {S64})
1448 .clampNumElements(0, V16S32, V32S32)
1449 .clampNumElements(0, V2S64, V16S64)
1452 if (
ST.hasScalarPackInsts()) {
1455 .minScalarOrElt(0, S16)
1459 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460 .legalFor({V2S16, S32})
1462 BuildVector.minScalarOrElt(0, S32);
1464 BuildVector.customFor({V2S16, S16});
1465 BuildVector.minScalarOrElt(0, S32);
1467 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1468 .customFor({V2S16, S32})
1475 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1477 .clampMaxNumElements(0, S32, 32)
1478 .clampMaxNumElements(1, S16, 2)
1479 .clampMaxNumElements(0, S16, 64);
1483 if (
ST.hasVOP3PInsts()) {
1484 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1485 .customFor({V2S16, V2S16})
1488 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1491 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1492 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
1493 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
1495 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
1496 const LLT Ty = Query.
Types[TypeIdx];
1507 auto &
Builder = getActionDefinitionsBuilder(
Op)
1509 .lowerFor({{S16, V2S16}})
1511 const LLT BigTy = Query.
Types[BigTyIdx];
1517 .widenScalarToNextPow2(LitTyIdx, 16)
1525 .clampScalar(LitTyIdx, S32, S512)
1526 .widenScalarToNextPow2(LitTyIdx, 32)
1529 [=](
const LegalityQuery &Query) {
return notValidElt(Query, LitTyIdx); },
1532 [=](
const LegalityQuery &Query) {
return notValidElt(Query, BigTyIdx); },
1534 .clampScalar(BigTyIdx, S32, MaxScalar);
1536 if (
Op == G_MERGE_VALUES) {
1540 const LLT Ty = Query.
Types[LitTyIdx];
1548 const LLT Ty = Query.
Types[BigTyIdx];
1555 const LLT &Ty = Query.
Types[BigTyIdx];
1557 if (NewSizeInBits >= 256) {
1559 if (RoundedTo < NewSizeInBits)
1560 NewSizeInBits = RoundedTo;
1562 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1571 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1572 .legalFor({{S32}, {S64}});
1574 if (
ST.hasVOP3PInsts()) {
1575 SextInReg.lowerFor({{V2S16}})
1580 }
else if (
ST.has16BitInsts()) {
1581 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1585 SextInReg.lowerFor({{S32}, {S64}});
1590 .clampScalar(0, S32, S64)
1593 getActionDefinitionsBuilder(G_FSHR)
1594 .legalFor({{S32, S32}})
1598 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1601 getActionDefinitionsBuilder(G_FENCE)
1604 getActionDefinitionsBuilder({
1608 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1617 G_FMINIMUM, G_FMAXIMUM,
1621 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1622 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1623 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1635 switch (
MI.getOpcode()) {
1636 case TargetOpcode::G_ADDRSPACE_CAST:
1638 case TargetOpcode::G_FRINT:
1640 case TargetOpcode::G_FCEIL:
1642 case TargetOpcode::G_FREM:
1644 case TargetOpcode::G_INTRINSIC_TRUNC:
1646 case TargetOpcode::G_SITOFP:
1648 case TargetOpcode::G_UITOFP:
1650 case TargetOpcode::G_FPTOSI:
1652 case TargetOpcode::G_FPTOUI:
1654 case TargetOpcode::G_FMINNUM:
1655 case TargetOpcode::G_FMAXNUM:
1656 case TargetOpcode::G_FMINNUM_IEEE:
1657 case TargetOpcode::G_FMAXNUM_IEEE:
1659 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1661 case TargetOpcode::G_INSERT_VECTOR_ELT:
1663 case TargetOpcode::G_SHUFFLE_VECTOR:
1665 case TargetOpcode::G_FSIN:
1666 case TargetOpcode::G_FCOS:
1668 case TargetOpcode::G_GLOBAL_VALUE:
1670 case TargetOpcode::G_LOAD:
1672 case TargetOpcode::G_FMAD:
1674 case TargetOpcode::G_FDIV:
1676 case TargetOpcode::G_UDIV:
1677 case TargetOpcode::G_UREM:
1679 case TargetOpcode::G_SDIV:
1680 case TargetOpcode::G_SREM:
1682 case TargetOpcode::G_ATOMIC_CMPXCHG:
1684 case TargetOpcode::G_FLOG:
1686 case TargetOpcode::G_FLOG10:
1688 case TargetOpcode::G_FEXP:
1690 case TargetOpcode::G_FPOW:
1692 case TargetOpcode::G_FFLOOR:
1694 case TargetOpcode::G_BUILD_VECTOR:
1713 if (ST.hasApertureRegs()) {
1727 Register GetReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1729 B.buildInstr(AMDGPU::S_GETREG_B32)
1732 MRI.setType(GetReg, S32);
1734 auto ShiftAmt =
B.buildConstant(S32, WidthM1 + 1);
1735 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1738 Register QueuePtr =
MRI.createGenericVirtualRegister(
1758 B.materializePtrAdd(LoadAddr, QueuePtr,
LLT::scalar(64), StructOffset);
1759 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1771 LLT DstTy =
MRI.getType(Dst);
1772 LLT SrcTy =
MRI.getType(Src);
1781 = static_cast<const AMDGPUTargetMachine &>(MF.
getTarget());
1783 if (
TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1784 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
1790 B.buildExtract(Dst, Src, 0);
1791 MI.eraseFromParent();
1802 auto HighAddr =
B.buildConstant(
1804 B.buildMerge(Dst, {Src, HighAddr});
1805 MI.eraseFromParent();
1812 unsigned NullVal =
TM.getNullPointerValue(DestAS);
1814 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
1815 auto FlatNull =
B.buildConstant(SrcTy, 0);
1818 auto PtrLo32 =
B.buildExtract(DstTy, Src, 0);
1822 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1824 MI.eraseFromParent();
1835 B.buildConstant(SrcTy,
TM.getNullPointerValue(SrcAS));
1837 B.buildConstant(DstTy,
TM.getNullPointerValue(DestAS));
1847 Register SrcAsInt =
B.buildPtrToInt(S32, Src).getReg(0);
1851 auto BuildPtr =
B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1852 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1854 MI.eraseFromParent();
1862 LLT Ty =
MRI.getType(Src);
1868 auto C1 =
B.buildFConstant(Ty, C1Val);
1869 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
1872 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
1873 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
1875 auto C2 =
B.buildFConstant(Ty, C2Val);
1876 auto Fabs =
B.buildFAbs(Ty, Src);
1879 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
1880 MI.eraseFromParent();
1898 auto Trunc =
B.buildIntrinsicTrunc(S64, Src);
1900 const auto Zero =
B.buildFConstant(S64, 0.0);
1901 const auto One =
B.buildFConstant(S64, 1.0);
1904 auto And =
B.buildAnd(S1, Lt0, NeTrunc);
1905 auto Add =
B.buildSelect(S64,
And, One, Zero);
1908 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
1916 Register Src0Reg =
MI.getOperand(1).getReg();
1917 Register Src1Reg =
MI.getOperand(2).getReg();
1918 auto Flags =
MI.getFlags();
1919 LLT Ty =
MRI.getType(DstReg);
1921 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
1923 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
1924 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925 MI.eraseFromParent();
1931 const unsigned FractBits = 52;
1932 const unsigned ExpBits = 11;
1935 auto Const0 =
B.buildConstant(S32, FractBits - 32);
1936 auto Const1 =
B.buildConstant(S32, ExpBits);
1938 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32},
false)
1940 .addUse(Const0.getReg(0))
1941 .addUse(Const1.getReg(0));
1943 return B.buildSub(S32, ExpPart,
B.buildConstant(S32, 1023));
1957 auto Unmerge =
B.buildUnmerge({S32, S32}, Src);
1964 const unsigned FractBits = 52;
1967 const auto SignBitMask =
B.buildConstant(S32, UINT32_C(1) << 31);
1968 auto SignBit =
B.buildAnd(S32,
Hi, SignBitMask);
1970 const auto FractMask =
B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1972 const auto Zero32 =
B.buildConstant(S32, 0);
1975 auto SignBit64 =
B.buildMerge(S64, {Zero32, SignBit});
1977 auto Shr =
B.buildAShr(S64, FractMask, Exp);
1978 auto Not =
B.buildNot(S64, Shr);
1979 auto Tmp0 =
B.buildAnd(S64, Src, Not);
1980 auto FiftyOne =
B.buildConstant(S32, FractBits - 1);
1985 auto Tmp1 =
B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1986 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987 MI.eraseFromParent();
2001 assert(
MRI.getType(Src) == S64 &&
MRI.getType(Dst) == S64);
2003 auto Unmerge =
B.buildUnmerge({S32, S32}, Src);
2006 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2007 B.buildUITOFP(S64, Unmerge.getReg(1));
2009 auto CvtLo =
B.buildUITOFP(S64, Unmerge.getReg(0));
2011 auto ThirtyTwo =
B.buildConstant(S32, 32);
2012 auto LdExp =
B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64},
false)
2013 .addUse(CvtHi.getReg(0))
2014 .addUse(ThirtyTwo.getReg(0));
2017 B.buildFAdd(Dst, LdExp, CvtLo);
2018 MI.eraseFromParent();
2034 assert(
MRI.getType(Src) == S64 &&
MRI.getType(Dst) == S64);
2036 unsigned Flags =
MI.getFlags();
2038 auto Trunc =
B.buildIntrinsicTrunc(S64, Src, Flags);
2039 auto K0 =
B.buildFConstant(S64,
BitsToDouble(UINT64_C(0x3df0000000000000)));
2040 auto K1 =
B.buildFConstant(S64,
BitsToDouble(UINT64_C(0xc1f0000000000000)));
2042 auto Mul =
B.buildFMul(S64, Trunc, K0, Flags);
2043 auto FloorMul =
B.buildFFloor(S64,
Mul, Flags);
2044 auto Fma =
B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2047 B.buildFPTOSI(S32, FloorMul) :
2048 B.buildFPTOUI(S32, FloorMul);
2049 auto Lo =
B.buildFPTOUI(S32, Fma);
2051 B.buildMerge(Dst, {
Lo,
Hi });
2052 MI.eraseFromParent();
2062 const bool IsIEEEOp =
MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2063 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2095 LLT VecTy =
MRI.getType(Vec);
2104 MI.eraseFromParent();
2128 LLT VecTy =
MRI.getType(Vec);
2137 MI.eraseFromParent();
2148 LLT DstTy =
MRI.getType(Dst);
2149 LLT SrcTy =
MRI.getType(Src0);
2151 if (SrcTy == V2S16 && DstTy == V2S16 &&
2167 LLT Ty =
MRI.getType(DstReg);
2168 unsigned Flags =
MI.getFlags();
2173 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2174 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty},
false)
2175 .addUse(MulVal.getReg(0))
2176 .setMIFlags(Flags).getReg(0);
2178 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2181 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2182 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg),
false)
2185 MI.eraseFromParent();
2193 unsigned GAFlags)
const {
2231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2242 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2245 B.buildExtract(DstReg, PCReg, 0);
2253 LLT Ty =
MRI.getType(DstReg);
2264 Fn,
"local memory global used by non-kernel function",
MI.getDebugLoc(),
2274 B.buildUndef(DstReg);
2275 MI.eraseFromParent();
2294 if (
B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2299 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32},
false);
2300 B.buildIntToPtr(DstReg, Sz);
2301 MI.eraseFromParent();
2309 MI.eraseFromParent();
2315 Fn,
"unsupported initializer for address space",
MI.getDebugLoc());
2324 MI.eraseFromParent();
2330 MI.eraseFromParent();
2335 Register GOTAddr =
MRI.createGenericVirtualRegister(PtrTy);
2347 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2348 B.buildExtract(DstReg,
Load, 0);
2350 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2352 MI.eraseFromParent();
2369 LLT PtrTy =
MRI.getType(PtrReg);
2374 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
2376 MI.getOperand(1).setReg(Cast.getReg(0));
2382 LLT ValTy =
MRI.getType(ValReg);
2386 const unsigned MemSize = 8 * MMO->
getSize();
2388 const unsigned AlignInBits = 8 * MemAlign.
value();
2396 if (WideMemSize == ValSize) {
2402 MI.setMemRefs(MF, {WideMMO});
2408 if (ValSize > WideMemSize)
2415 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416 B.buildTrunc(ValReg, WideLoad).getReg(0);
2423 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424 B.buildExtract(ValReg, WideLoad, 0);
2428 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
2430 B.setInsertPt(
B.getMBB(),
MI.getIterator());
2431 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2435 MI.eraseFromParent();
2445 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
2472 "this should not have been custom lowered");
2474 LLT ValTy =
MRI.getType(CmpVal);
2477 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
2479 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2483 .setMemRefs(
MI.memoperands());
2485 MI.eraseFromParent();
2493 LLT Ty =
B.getMRI()->getType(Dst);
2494 unsigned Flags =
MI.getFlags();
2496 auto Log2Operand =
B.buildFLog2(Ty, Src, Flags);
2497 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
2499 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2500 MI.eraseFromParent();
2508 unsigned Flags =
MI.getFlags();
2509 LLT Ty =
B.getMRI()->getType(Dst);
2512 auto Mul =
B.buildFMul(Ty, Src, K, Flags);
2513 B.buildFExp2(Dst,
Mul, Flags);
2514 MI.eraseFromParent();
2523 unsigned Flags =
MI.getFlags();
2524 LLT Ty =
B.getMRI()->getType(Dst);
2529 auto Log =
B.buildFLog2(S32, Src0, Flags);
2530 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32},
false)
2531 .addUse(Log.getReg(0))
2534 B.buildFExp2(Dst,
Mul, Flags);
2535 }
else if (Ty == S16) {
2537 auto Log =
B.buildFLog2(S16, Src0, Flags);
2538 auto Ext0 =
B.buildFPExt(S32, Log, Flags);
2539 auto Ext1 =
B.buildFPExt(S32, Src1, Flags);
2540 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32},
false)
2541 .addUse(Ext0.getReg(0))
2542 .addUse(Ext1.getReg(0))
2545 B.buildFExp2(Dst,
B.buildFPTrunc(S16,
Mul), Flags);
2549 MI.eraseFromParent();
2557 ModSrc = SrcFNeg->getOperand(1).getReg();
2559 ModSrc = SrcFAbs->getOperand(1).getReg();
2561 ModSrc = SrcFAbs->getOperand(1).getReg();
2572 Register OrigSrc =
MI.getOperand(1).getReg();
2573 unsigned Flags =
MI.getFlags();
2575 "this should not have been custom lowered");
2585 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64},
false)
2596 auto Const =
B.buildFConstant(S64,
BitsToDouble(0x3fefffffffffffff));
2598 Register Min =
MRI.createGenericVirtualRegister(S64);
2604 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2606 B.buildFMinNum(Min, Fract, Const, Flags);
2611 CorrectedFract =
B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2614 auto NegFract =
B.buildFNeg(S64, CorrectedFract, Flags);
2615 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2617 MI.eraseFromParent();
2633 auto Merge =
B.buildMerge(S32, {Src0, Src1});
2634 B.buildBitcast(Dst,
Merge);
2636 MI.eraseFromParent();
2642 if (
MI.getOpcode() != TargetOpcode::G_XOR)
2645 return ConstVal && *ConstVal == -1;
2652 Register CondDef =
MI.getOperand(0).getReg();
2653 if (!
MRI.hasOneNonDBGUse(CondDef))
2661 if (!
MRI.hasOneNonDBGUse(NegatedCond))
2667 UseMI = &*
MRI.use_instr_nodbg_begin(NegatedCond);
2676 if (Next == Parent->
end()) {
2680 UncondBrTarget = &*NextMBB;
2682 if (Next->getOpcode() != AMDGPU::G_BR)
2701 if (
Arg->isMasked()) {
2704 const unsigned Mask =
Arg->getMask();
2705 const unsigned Shift = countTrailingZeros<unsigned>(
Mask);
2710 auto ShiftAmt =
B.buildConstant(S32, Shift);
2711 AndMaskSrc =
B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2714 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(S32,
Mask >> Shift));
2716 B.buildCopy(DstReg, LiveIn);
2731 if (!
Arg->isRegister() || !
Arg->getRegister().isValid())
2742 MI.eraseFromParent();
2750 LLT DstTy =
MRI.getType(Dst);
2780 auto FloatY =
B.buildUITOFP(S32,
Y);
2781 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2782 auto Scale =
B.buildFConstant(S32,
BitsToFloat(0x4f7ffffe));
2783 auto ScaledY =
B.buildFMul(S32, RcpIFlag, Scale);
2784 auto Z =
B.buildFPTOUI(S32, ScaledY);
2787 auto NegY =
B.buildSub(S32,
B.buildConstant(S32, 0),
Y);
2788 auto NegYZ =
B.buildMul(S32, NegY,
Z);
2789 Z =
B.buildAdd(S32,
Z,
B.buildUMulH(S32,
Z, NegYZ));
2792 auto Q =
B.buildUMulH(S32,
X,
Z);
2793 auto R =
B.buildSub(S32,
X,
B.buildMul(S32, Q,
Y));
2796 auto One =
B.buildConstant(S32, 1);
2799 Q =
B.buildSelect(S32,
Cond,
B.buildAdd(S32, Q, One), Q);
2800 R =
B.buildSelect(S32,
Cond,
B.buildSub(S32, R,
Y), R);
2805 B.buildSelect(DstReg,
Cond,
B.buildAdd(S32, Q, One), Q);
2807 B.buildSelect(DstReg,
Cond,
B.buildSub(S32, R,
Y), R);
2813 const bool IsDiv =
MI.getOpcode() == AMDGPU::G_UDIV;
2818 MI.eraseFromParent();
2838 auto Unmerge =
B.buildUnmerge(S32, Val);
2840 auto CvtLo =
B.buildUITOFP(S32, Unmerge.getReg(0));
2841 auto CvtHi =
B.buildUITOFP(S32, Unmerge.getReg(1));
2843 auto Mad =
B.buildFMAD(S32, CvtHi,
2844 B.buildFConstant(S32,
BitsToFloat(0x4f800000)), CvtLo);
2846 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2848 B.buildFMul(S32, Rcp,
B.buildFConstant(S32,
BitsToFloat(0x5f7ffffc)));
2852 B.buildFMul(S32, Mul1,
B.buildFConstant(S32,
BitsToFloat(0x2f800000)));
2853 auto Trunc =
B.buildIntrinsicTrunc(S32, Mul2);
2856 auto Mad2 =
B.buildFMAD(S32, Trunc,
2859 auto ResultLo =
B.buildFPTOUI(S32, Mad2);
2860 auto ResultHi =
B.buildFPTOUI(S32, Trunc);
2862 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2877 auto Rcp =
B.buildMerge(S64, {RcpLo, RcpHi});
2879 auto Zero64 =
B.buildConstant(S64, 0);
2880 auto NegDenom =
B.buildSub(S64, Zero64, Denom);
2882 auto MulLo1 =
B.buildMul(S64, NegDenom, Rcp);
2883 auto MulHi1 =
B.buildUMulH(S64, Rcp, MulLo1);
2885 auto UnmergeMulHi1 =
B.buildUnmerge(S32, MulHi1);
2886 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2887 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2889 auto Add1_Lo =
B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2890 auto Add1_Hi =
B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2891 auto Add1_HiNc =
B.buildAdd(S32, RcpHi, MulHi1_Hi);
2892 auto Add1 =
B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2894 auto MulLo2 =
B.buildMul(S64, NegDenom, Add1);
2895 auto MulHi2 =
B.buildUMulH(S64, Add1, MulLo2);
2896 auto UnmergeMulHi2 =
B.buildUnmerge(S32, MulHi2);
2897 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2898 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2900 auto Zero32 =
B.buildConstant(S32, 0);
2901 auto Add2_Lo =
B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2903 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2904 auto Add2_Hi =
B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2905 auto Add2 =
B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2907 auto UnmergeNumer =
B.buildUnmerge(S32, Numer);
2908 Register NumerLo = UnmergeNumer.getReg(0);
2909 Register NumerHi = UnmergeNumer.getReg(1);
2911 auto MulHi3 =
B.buildUMulH(S64, Numer, Add2);
2912 auto Mul3 =
B.buildMul(S64, Denom, MulHi3);
2913 auto UnmergeMul3 =
B.buildUnmerge(S32, Mul3);
2914 Register Mul3_Lo = UnmergeMul3.getReg(0);
2915 Register Mul3_Hi = UnmergeMul3.getReg(1);
2916 auto Sub1_Lo =
B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2917 auto Sub1_Hi =
B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2918 auto Sub1_Mi =
B.buildSub(S32, NumerHi, Mul3_Hi);
2919 auto Sub1 =
B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2921 auto UnmergeDenom =
B.buildUnmerge(S32, Denom);
2922 Register DenomLo = UnmergeDenom.getReg(0);
2923 Register DenomHi = UnmergeDenom.getReg(1);
2926 auto C1 =
B.buildSExt(S32, CmpHi);
2929 auto C2 =
B.buildSExt(S32, CmpLo);
2932 auto C3 =
B.buildSelect(S32, CmpEq, C2, C1);
2939 auto Sub2_Lo =
B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2940 auto Sub2_Mi =
B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2941 auto Sub2_Hi =
B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2942 auto Sub2 =
B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2944 auto One64 =
B.buildConstant(S64, 1);
2945 auto Add3 =
B.buildAdd(S64, MulHi3, One64);
2951 auto C6 =
B.buildSelect(
2955 auto Add4 =
B.buildAdd(S64, Add3, One64);
2956 auto Sub3_Lo =
B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2958 auto Sub3_Mi =
B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2959 auto Sub3_Hi =
B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2960 auto Sub3 =
B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2966 auto Sel1 =
B.buildSelect(
2968 B.buildSelect(DstReg,
2971 auto Sel2 =
B.buildSelect(
2973 B.buildSelect(DstReg,
2983 const bool IsDiv =
MI.getOpcode() == AMDGPU::G_UDIV;
2987 LLT Ty =
MRI.getType(DstReg);
2996 MI.eraseFromParent();
3008 const LLT Ty =
MRI.getType(DstReg);
3009 if (Ty != S32 && Ty != S64)
3012 const bool IsDiv =
MI.getOpcode() == AMDGPU::G_SDIV;
3017 auto SignBitOffset =
B.buildConstant(S32, Ty.
getSizeInBits() - 1);
3018 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
3019 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
3021 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
3022 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
3024 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
3025 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
3027 Register UDivRem =
MRI.createGenericVirtualRegister(Ty);
3035 Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
3037 Sign = LHSign.getReg(0);
3039 UDivRem =
B.buildXor(Ty, UDivRem, Sign).getReg(0);
3040 B.buildSub(DstReg, UDivRem, Sign);
3042 MI.eraseFromParent();
3053 LLT ResTy =
MRI.getType(Res);
3059 if (!AllowInaccurateRcp)
3064 if (CLHS->isExactlyValue(1.0)) {
3065 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res,
false)
3069 MI.eraseFromParent();
3074 if (CLHS->isExactlyValue(-1.0)) {
3075 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
3076 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res,
false)
3077 .addUse(FNeg.getReg(0))
3080 MI.eraseFromParent();
3086 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy},
false)
3089 B.buildFMul(Res, LHS,
RCP, Flags);
3091 MI.eraseFromParent();
3107 auto LHSExt =
B.buildFPExt(S32, LHS, Flags);
3108 auto RHSExt =
B.buildFPExt(S32, RHS, Flags);
3110 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3111 .addUse(RHSExt.getReg(0))
3114 auto QUOT =
B.buildFMul(S32, LHSExt,
RCP, Flags);
3115 auto RDst =
B.buildFPTrunc(S16, QUOT, Flags);
3117 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res,
false)
3118 .addUse(RDst.getReg(0))
3123 MI.eraseFromParent();
3134 unsigned SPDenormMode =
3137 if (
ST.hasDenormModeInst()) {
3139 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
3141 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3142 B.buildInstr(AMDGPU::S_DENORM_MODE)
3143 .addImm(NewDenormModeValue);
3151 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3152 .addImm(SPDenormMode)
3153 .addImm(SPDenormModeBitField);
3171 auto One =
B.buildFConstant(S32, 1.0f);
3173 auto DenominatorScaled =
3174 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1},
false)
3179 auto NumeratorScaled =
3180 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1},
false)
3186 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3187 .addUse(DenominatorScaled.getReg(0))
3189 auto NegDivScale0 =
B.buildFNeg(S32, DenominatorScaled, Flags);
3193 if (!
Mode.allFP32Denormals())
3196 auto Fma0 =
B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3197 auto Fma1 =
B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3198 auto Mul =
B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3199 auto Fma2 =
B.buildFMA(S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
3200 auto Fma3 =
B.buildFMA(S32, Fma2, Fma1,
Mul, Flags);
3201 auto Fma4 =
B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3203 if (!
Mode.allFP32Denormals())
3206 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32},
false)
3207 .addUse(Fma4.getReg(0))
3208 .addUse(Fma1.getReg(0))
3209 .addUse(Fma3.getReg(0))
3210 .addUse(NumeratorScaled.getReg(1))
3213 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res,
false)
3214 .addUse(Fmas.getReg(0))
3219 MI.eraseFromParent();
3235 auto One =
B.buildFConstant(S64, 1.0);
3237 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
3243 auto NegDivScale0 =
B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3245 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64},
false)
3246 .addUse(DivScale0.getReg(0))
3249 auto Fma0 =
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3250 auto Fma1 =
B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3251 auto Fma2 =
B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3253 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
3259 auto Fma3 =
B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3260 auto Mul =
B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3261 auto Fma4 =
B.buildFMA(S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
3270 auto NumUnmerge =
B.buildUnmerge(S32, LHS);
3271 auto DenUnmerge =
B.buildUnmerge(S32, RHS);
3272 auto Scale0Unmerge =
B.buildUnmerge(S32, DivScale0);
3273 auto Scale1Unmerge =
B.buildUnmerge(S32, DivScale1);
3276 Scale1Unmerge.getReg(1));
3278 Scale0Unmerge.getReg(1));
3279 Scale =
B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3281 Scale = DivScale1.getReg(1);
3284 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64},
false)
3285 .addUse(Fma4.getReg(0))
3286 .addUse(Fma3.getReg(0))
3287 .addUse(
Mul.getReg(0))
3291 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
makeArrayRef(Res),
false)
3292 .addUse(Fmas.getReg(0))
3297 MI.eraseFromParent();
3312 auto Abs =
B.buildFAbs(S32, RHS, Flags);
3315 auto C0 =
B.buildConstant(S32, 0x6f800000);
3316 auto C1 =
B.buildConstant(S32, 0x2f800000);
3320 auto Sel =
B.buildSelect(S32, CmpRes, C1, C2, Flags);
3322 auto Mul0 =
B.buildFMul(S32, RHS, Sel, Flags);
3324 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3325 .addUse(Mul0.getReg(0))
3328 auto Mul1 =
B.buildFMul(S32, LHS,
RCP, Flags);
3330 B.buildFMul(Res, Sel, Mul1, Flags);
3332 MI.eraseFromParent();
3350 auto Flags =
MI.getFlags();
3352 LLT Ty =
MRI.getType(Dst);
3362 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty},
false)
3372 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3373 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3378 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3380 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3381 MI.eraseFromParent();
3387 case Intrinsic::amdgcn_ds_fadd:
3388 return AMDGPU::G_ATOMICRMW_FADD;
3389 case Intrinsic::amdgcn_ds_fmin:
3390 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3391 case Intrinsic::amdgcn_ds_fmax:
3392 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3408 for (
int I = 6;
I > 3; --
I)
3409 MI.RemoveOperand(
I);
3411 MI.RemoveOperand(1);
3422 LLT DstTy =
MRI.getType(DstReg);
3425 Register KernargPtrReg =
MRI.createGenericVirtualRegister(DstTy);
3431 B.buildPtrAdd(DstReg, KernargPtrReg,
B.buildConstant(IdxTy,
Offset).getReg(0));
3448 MI.eraseFromParent();
3455 unsigned AddrSpace)
const {
3457 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32),
MI.getOperand(2).getReg());
3461 MI.eraseFromParent();
3471 std::tuple<Register, unsigned, unsigned>
3474 const unsigned MaxImm = 4095;
3476 unsigned TotalConstOffset;
3479 std::tie(BaseReg, TotalConstOffset) =
3482 unsigned ImmOffset = TotalConstOffset;
3491 unsigned Overflow = ImmOffset & ~MaxImm;
3492 ImmOffset -= Overflow;
3493 if ((int32_t)Overflow < 0) {
3494 Overflow += ImmOffset;
3498 if (Overflow != 0) {
3500 BaseReg =
B.buildConstant(S32, Overflow).getReg(0);
3502 auto OverflowVal =
B.buildConstant(S32, Overflow);
3503 BaseReg =
B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3508 BaseReg =
B.buildConstant(S32, 0).getReg(0);
3510 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3517 bool ImageStore)
const {
3524 auto Unmerge =
B.buildUnmerge(S16,
Reg);
3527 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
3532 return B.buildBuildVector(
LLT::vector(NumElts, S32), WideRegs).getReg(0);
3538 Reg =
B.buildBitcast(S32,
Reg).getReg(0);
3540 PackedRegs.
resize(2,
B.buildUndef(S32).getReg(0));
3541 return B.buildBuildVector(
LLT::vector(2, S32), PackedRegs).getReg(0);
3546 auto Unmerge =
B.buildUnmerge(S16,
Reg);
3547 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
3549 PackedRegs.
resize(6,
B.buildUndef(S16).getReg(0));
3557 auto Unmerge =
B.buildUnmerge(S32,
Reg);
3558 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
3560 PackedRegs.
resize(4,
B.buildUndef(S32).getReg(0));
3561 return B.buildBuildVector(
LLT::vector(4, S32), PackedRegs).getReg(0);
3573 LLT Ty =
MRI->getType(VData);
3597 bool IsFormat)
const {
3599 LLT Ty =
MRI.getType(VData);
3601 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
3608 const int MemSize = MMO->
getSize();
3611 unsigned TotalOffset;
3614 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3617 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
3621 VIndex =
MI.getOperand(3).getReg();
3625 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
3626 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
3630 Format =
MI.getOperand(5 + OpOffset).getImm();
3634 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
3637 if (TotalOffset != 0)
3638 MMO =
B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3642 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3643 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3644 }
else if (IsFormat) {
3645 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3646 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3650 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3653 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3656 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3662 VIndex =
B.buildConstant(S32, 0).getReg(0);
3664 auto MIB =
B.buildInstr(Opc)
3675 MIB.addImm(AuxiliaryData)
3676 .addImm(HasVIndex ? -1 : 0)
3677 .addMemOperand(MMO);
3679 MI.eraseFromParent();
3687 bool IsTyped)
const {
3690 const int MemSize = MMO->
getSize();
3697 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3700 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
3704 VIndex =
MI.getOperand(3).getReg();
3708 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
3709 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
3713 Format =
MI.getOperand(5 + OpOffset).getImm();
3717 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
3719 unsigned TotalOffset;
3721 LLT Ty =
MRI.getType(Dst);
3723 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
3727 if (TotalOffset != 0)
3728 MMO =
B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3733 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3734 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3735 }
else if (IsFormat) {
3736 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3737 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3741 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3744 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3747 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3754 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.
isVector());
3758 LoadDstReg =
B.getMRI()->createGenericVirtualRegister(S32);
3759 else if (Unpacked && IsD16 && Ty.
isVector())
3760 LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3765 VIndex =
B.buildConstant(S32, 0).getReg(0);
3767 auto MIB =
B.buildInstr(Opc)
3778 MIB.addImm(AuxiliaryData)
3779 .addImm(HasVIndex ? -1 : 0)
3780 .addMemOperand(MMO);
3782 if (LoadDstReg != Dst) {
3783 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
3787 B.buildTrunc(Dst, LoadDstReg);
3791 auto Unmerge =
B.buildUnmerge(S32, LoadDstReg);
3793 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
3795 B.buildMerge(Dst, Repack);
3799 MI.eraseFromParent();
3806 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3807 AMDGPU::G_AMDGPU_ATOMIC_DEC;
3809 .addDef(
MI.getOperand(0).getReg())
3810 .addUse(
MI.getOperand(2).getReg())
3811 .addUse(
MI.getOperand(3).getReg())
3813 MI.eraseFromParent();
3819 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3820 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3821 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3822 case Intrinsic::amdgcn_raw_buffer_atomic_add:
3823 case Intrinsic::amdgcn_struct_buffer_atomic_add:
3824 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3825 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3826 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3827 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3828 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3829 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3830 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3831 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3832 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3833 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3834 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3835 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3836 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3837 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3838 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3839 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3840 case Intrinsic::amdgcn_raw_buffer_atomic_and:
3841 case Intrinsic::amdgcn_struct_buffer_atomic_and:
3842 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3843 case Intrinsic::amdgcn_raw_buffer_atomic_or:
3844 case Intrinsic::amdgcn_struct_buffer_atomic_or:
3845 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3846 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3847 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3848 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3849 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3850 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3851 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3852 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3853 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3854 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3855 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3856 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3857 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3858 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3859 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3869 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3870 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3871 const bool HasReturn =
MI.getNumExplicitDefs() != 0;
3878 Dst =
MI.getOperand(0).getReg();
3883 Register VData =
MI.getOperand(2 + OpOffset).getReg();
3887 CmpVal =
MI.getOperand(3 + OpOffset).getReg();
3891 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
3892 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3895 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
3898 VIndex =
MI.getOperand(4 + OpOffset).getReg();
3902 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
3903 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
3904 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
3909 unsigned TotalOffset;
3911 if (TotalOffset != 0)
3912 MMO =
B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->
getSize());
3915 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
3932 .addImm(AuxiliaryData)
3933 .addImm(HasVIndex ? -1 : 0)
3934 .addMemOperand(MMO);
3936 MI.eraseFromParent();
3949 for (
unsigned I =
Intr->VAddrStart;
I < EndIdx;
I++) {
3956 if (I < Intr->GradientStart) {
3957 AddrReg =
B.buildBitcast(V2S16, AddrReg).getReg(0);
3962 if (((
I + 1) >= EndIdx) ||
3963 ((
Intr->NumGradients / 2) % 2 == 1 &&
3964 (
I == static_cast<unsigned>(
Intr->GradientStart +
3965 (
Intr->NumGradients / 2) - 1) ||
3966 I == static_cast<unsigned>(
Intr->GradientStart +
3967 Intr->NumGradients - 1))) ||
3969 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
3971 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3976 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3987 int DimIdx,
int NumVAddrs) {
3991 for (
int I = 0;
I != NumVAddrs; ++
I) {
3993 if (
SrcOp.isReg()) {
3999 int NumAddrRegs = AddrRegs.
size();
4000 if (NumAddrRegs != 1) {
4005 auto Undef =
B.buildUndef(S32);
4006 AddrRegs.
append(RoundedNumRegs - NumAddrRegs,
Undef.getReg(0));
4007 NumAddrRegs = RoundedNumRegs;
4010 auto VAddr =
B.buildBuildVector(
LLT::vector(NumAddrRegs, 32), AddrRegs);
4011 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4014 for (
int I = 1;
I != NumVAddrs; ++
I) {
4017 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
4038 const unsigned NumDefs =
MI.getNumExplicitDefs();
4039 const unsigned ArgOffset = NumDefs + 1;
4040 bool IsTFE = NumDefs == 2;
4057 MRI->getType(
MI.getOperand(ArgOffset +
Intr->GradientStart).getReg());
4059 MRI->getType(
MI.getOperand(ArgOffset +
Intr->CoordStart).getReg());
4060 const bool IsG16 = GradTy == S16;
4061 const bool IsA16 = AddrTy == S16;
4064 if (!BaseOpcode->
Atomic) {
4065 DMask =
MI.getOperand(ArgOffset +
Intr->DMaskIndex).getImm();
4068 }
else if (DMask != 0) {
4070 }
else if (!IsTFE && !BaseOpcode->
Store) {
4072 B.buildUndef(
MI.getOperand(0));
4073 MI.eraseFromParent();
4081 unsigned NewOpcode = NumDefs == 0 ?
4082 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4085 MI.setDesc(
B.getTII().get(NewOpcode));
4089 if (IsTFE && DMask == 0) {
4092 MI.getOperand(ArgOffset +
Intr->DMaskIndex).setImm(DMask);
4095 if (BaseOpcode->
Atomic) {
4097 LLT Ty =
MRI->getType(VData0);
4107 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
4108 MI.getOperand(2).setReg(
Concat.getReg(0));
4109 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4113 unsigned CorrectedNumVAddrs =
Intr->NumVAddrs;
4129 --CorrectedNumVAddrs;
4131 MI.getOperand(
MI.getNumExplicitDefs())
4132 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->
Intr));
4133 MI.RemoveOperand(ArgOffset +
Intr->LodIndex);
4134 Intr = NewImageDimIntr;
4141 int64_t ConstantLod;
4144 if (ConstantLod == 0) {
4147 MI.getOperand(ArgOffset +
Intr->MipIndex).ChangeToImmediate(0);
4148 --CorrectedNumVAddrs;
4154 if (IsA16 || IsG16) {
4157 if (!ST.
hasA16() || !IsG16)
4162 if (
Intr->NumVAddrs > 1) {
4165 const int PackEndIdx = IsA16 ?
Intr->VAddrEnd :
Intr->CoordStart;
4171 for (
unsigned I =
Intr->CoordStart; I < Intr->VAddrEnd;
I++) {
4172 int AddrReg =
MI.getOperand(ArgOffset +
I).getReg();
4181 if (!UseNSA && PackedRegs.
size() > 1) {
4183 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
4184 PackedRegs[0] =
Concat.getReg(0);
4188 const unsigned NumPacked = PackedRegs.
size();
4189 for (
unsigned I =
Intr->VAddrStart; I < Intr->VAddrEnd;
I++) {
4191 if (!
SrcOp.isReg()) {
4198 if (
I -
Intr->VAddrStart < NumPacked)
4199 SrcOp.setReg(PackedRegs[
I -
Intr->VAddrStart]);
4201 SrcOp.setReg(AMDGPU::NoRegister);
4216 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.
hasNSAEncoding();
4218 if (!UseNSA &&
Intr->NumVAddrs > 1)
4230 if (BaseOpcode->
Store) {
4233 LLT Ty =
MRI->getType(VData);
4238 if (RepackedReg != VData) {
4239 MI.getOperand(1).setReg(RepackedReg);
4246 LLT Ty =
MRI->getType(DstReg);
4252 if (NumElts < DMaskLanes)
4255 if (NumElts > 4 || DMaskLanes > 4)
4258 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4278 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
4279 unsigned RoundedSize = 32 * RoundedElts;
4282 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4287 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
4293 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
4297 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4298 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
4300 Register NewResultReg =
MRI->createGenericVirtualRegister(LoadResultTy);
4302 MI.getOperand(0).setReg(NewResultReg);
4310 Dst1Reg =
MI.getOperand(1).getReg();
4311 if (
MRI->getType(Dst1Reg) != S32)
4315 MI.RemoveOperand(1);
4319 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4328 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4330 if (ResultNumRegs == 1) {
4332 ResultRegs[0] = NewResultReg;
4335 for (
int I = 0;
I != NumDataRegs; ++
I)
4336 ResultRegs[
I] =
MRI->createGenericVirtualRegister(RegTy);
4337 B.buildUnmerge(ResultRegs, NewResultReg);
4342 ResultRegs.
resize(NumDataRegs);
4348 B.buildTrunc(DstReg, ResultRegs[0]);
4354 B.buildBitcast(DstReg, ResultRegs[0]);
4368 Reg =
B.buildBitcast(V2S16,
Reg).getReg(0);
4371 Reg =
B.buildTrunc(S16,
Reg).getReg(0);
4375 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
4379 for (
int I = 0;
I != NumElts; ++
I)
4384 LLT ResTy =
MRI->getType(ResultRegs[0]);
4386 padWithUndef(ResTy, NumElts - ResultRegs.
size());
4387 B.buildBuildVector(DstReg, ResultRegs);
4397 padWithUndef(ResTy, RegsToCover - ResultRegs.
size() + 1);
4399 B.buildUnmerge({DstReg,
MRI->createGenericVirtualRegister(V3S16)},
Concat);
4403 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
4404 B.buildConcatVectors(DstReg, ResultRegs);
4414 LLT Ty =
B.getMRI()->getType(Dst);
4423 Dst =
MI.getOperand(0).getReg();
4424 B.setInsertPt(
B.getMBB(),
MI);
4430 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4431 MI.RemoveOperand(1);
4435 const unsigned MemSize = (
Size + 7) / 8;
4436 const Align MemAlign(4);
4442 MI.addMemOperand(MF, MMO);
4465 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4476 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4477 B.buildCopy(SGPR01, LiveIn);
4478 B.buildInstr(AMDGPU::S_TRAP)
4483 MI.eraseFromParent();
4494 "debugtrap handler not supported",
4496 LLVMContext &Ctx =
B.getMF().getFunction().getContext();
4497 Ctx.diagnose(NoTrap);
4503 MI.eraseFromParent();
4514 Register NodePtr = MI.getOperand(2).getReg();
4515 Register RayExtent = MI.getOperand(3).getReg();
4516 Register RayOrigin = MI.getOperand(4).getReg();
4518 Register RayInvDir = MI.getOperand(6).getReg();
4521 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4522 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4523 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4524 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4525 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4526 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4530 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4538 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4539 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4545 packLanes(RayOrigin);
4547 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4548 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4549 Register R1 = MRI.createGenericVirtualRegister(S32);
4551 Register R3 = MRI.createGenericVirtualRegister(S32);
4552 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4553 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4554 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4560 packLanes(RayInvDir);
4563 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4572 .addImm(IsA16 ? 1 : 0)
4575 MI.eraseFromParent();
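// legalizeIntrinsic dispatches on the intrinsic ID. The structurizer control
// flow intrinsics (amdgcn_if/else/loop) are rewritten into SI_IF, SI_ELSE, and
// SI_LOOP pseudos anchored at the conditional branch they guard, and their
// mask values are constrained to the wave mask register class.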
4585 auto IntrID = MI.getIntrinsicID();
4587 case Intrinsic::amdgcn_if:
4588 case Intrinsic::amdgcn_else: {
4591 bool Negated = false;
4595 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4603 std::swap(CondBrTarget, UncondBrTarget);
4605 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4606 if (IntrID == Intrinsic::amdgcn_if) {
4607 B.buildInstr(AMDGPU::SI_IF)
4610 .addMBB(UncondBrTarget);
4612 B.buildInstr(AMDGPU::SI_ELSE)
4615 .addMBB(UncondBrTarget);
4624 B.buildBr(*CondBrTarget);
4627 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4628 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4629 MI.eraseFromParent();
4630 BrCond->eraseFromParent();
4636 case Intrinsic::amdgcn_loop: {
4639 bool Negated = false;
4643 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4649 std::swap(CondBrTarget, UncondBrTarget);
4651 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4652 B.buildInstr(AMDGPU::SI_LOOP)
4654 .addMBB(UncondBrTarget);
4659 B.buildBr(*CondBrTarget);
4661 MI.eraseFromParent();
4662 BrCond->eraseFromParent();
4663 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4669 case Intrinsic::amdgcn_kernarg_segment_ptr:
4672 B.buildConstant(MI.getOperand(0).getReg(), 0);
4673 MI.eraseFromParent();
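// The remaining cases cover the preloaded-argument intrinsics (work-item and
// work-group IDs, dispatch, queue, and implicit-argument pointers), the raw
// and struct buffer load/store and atomic intrinsics, DS floating-point
// atomics, trap and debugtrap, rsq_clamp, and the BVH intersect-ray intrinsic.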
4679 case Intrinsic::amdgcn_implicitarg_ptr:
4681 case Intrinsic::amdgcn_workitem_id_x:
4684 case Intrinsic::amdgcn_workitem_id_y:
4687 case Intrinsic::amdgcn_workitem_id_z:
4690 case Intrinsic::amdgcn_workgroup_id_x:
4693 case Intrinsic::amdgcn_workgroup_id_y:
4696 case Intrinsic::amdgcn_workgroup_id_z:
4699 case Intrinsic::amdgcn_dispatch_ptr:
4702 case Intrinsic::amdgcn_queue_ptr:
4705 case Intrinsic::amdgcn_implicit_buffer_ptr:
4708 case Intrinsic::amdgcn_dispatch_id:
4711 case Intrinsic::amdgcn_fdiv_fast:
4713 case Intrinsic::amdgcn_is_shared:
4715 case Intrinsic::amdgcn_is_private:
4717 case Intrinsic::amdgcn_wavefrontsize: {
4719 MI.eraseFromParent();
4722 case Intrinsic::amdgcn_s_buffer_load:
4724 case Intrinsic::amdgcn_raw_buffer_store:
4725 case Intrinsic::amdgcn_struct_buffer_store:
4727 case Intrinsic::amdgcn_raw_buffer_store_format:
4728 case Intrinsic::amdgcn_struct_buffer_store_format:
4730 case Intrinsic::amdgcn_raw_tbuffer_store:
4731 case Intrinsic::amdgcn_struct_tbuffer_store:
4733 case Intrinsic::amdgcn_raw_buffer_load:
4734 case Intrinsic::amdgcn_struct_buffer_load:
4736 case Intrinsic::amdgcn_raw_buffer_load_format:
4737 case Intrinsic::amdgcn_struct_buffer_load_format:
4739 case Intrinsic::amdgcn_raw_tbuffer_load:
4740 case Intrinsic::amdgcn_struct_tbuffer_load:
4742 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4743 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4744 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4745 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4746 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4747 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4748 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4749 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4750 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4751 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4752 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4753 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4754 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4755 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4756 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4757 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4758 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4759 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4760 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4761 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4762 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4763 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4764 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4765 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4766 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4767 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4768 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4769 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4771 case Intrinsic::amdgcn_atomic_inc:
4773 case Intrinsic::amdgcn_atomic_dec:
4775 case Intrinsic::trap:
4777 case Intrinsic::debugtrap:
4779 case Intrinsic::amdgcn_rsq_clamp:
4781 case Intrinsic::amdgcn_ds_fadd:
4782 case Intrinsic::amdgcn_ds_fmin:
4783 case Intrinsic::amdgcn_ds_fmax:
4785 case Intrinsic::amdgcn_image_bvh_intersect_ray: