#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
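// The lambdas below are LegalityPredicate / LegalizeMutation helpers used
// while building the AMDGPU rule set: detecting small odd-sized or wide
// 16-bit vectors, adding an element, and rounding type sizes up to 32- or
// 64-bit multiples.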
    const LLT Ty = Query.Types[TypeIdx];
           EltSize > 1 && EltSize < 32 &&

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;

    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

  return EltSize == 16 || EltSize % 32 == 0;

  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
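// Helpers that bound the legal memory access width per address space (DS,
// scratch, global) and decide when a load or store must be widened, split,
// or rejected for insufficient alignment.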
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;

  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                    Align(AlignBits / 8)))

  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)

                            uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

                                 RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)

                         Query.Types[1].getAddressSpace(), Opcode);
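// AMDGPULegalizerInfo constructor: declare the pointer, vector and FP type
// lists used by the rules, then register per-opcode legalization actions.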
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  std::initializer_list<LLT> AllS32Vectors =
      {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
       V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
      {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {
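  // Integer arithmetic, carry ops, constants and basic bit operations.
  // Scalars are generally clamped to 32/64 bits; 16-bit and packed 16-bit
  // forms are only legal when the subtarget has the matching instructions.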
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)

    .legalFor({S32, S16, V2S16})
    .clampMaxNumElementsStrict(0, S16, 2)

    .clampMaxNumElementsStrict(0, S16, 2)

    .legalFor({S32, S16, V2S16})
    .minScalarOrElt(0, S16)

    .legalFor({S32, S16})
    .widenScalarToNextMultipleOf(0, 32)

    .legalFor({S32, S16})
    .widenScalarToNextMultipleOf(0, 32)
    .widenScalarToNextMultipleOf(0, 32);
  Mul.maxScalar(0, S32);

    .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)

    .clampMaxNumElements(0, S8, 2)

    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)

    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});

    .customFor({S32, S64});

      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

    .legalFor(FPTypesPK16)

    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

    .clampScalar(0, S32, S64);

    .legalFor({S32, S64})
    .clampScalar(0, S32, S64);

    .clampScalar(0, S32, S64);

    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))

    .lowerFor({S64, V2S16});

    .lowerFor({S64, S16, V2S16});

  FMad.customFor({S32, S16});
  FMad.customFor({S32});
  FMad.customFor({S16});

  FRem.customFor({S16, S32, S64});
  FRem.minScalar(0, S32)
      .customFor({S32, S64});

    .clampMaxNumElements(0, S16, 2)

    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})

    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)

  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32);
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElements(0, S16, 2)
        .widenScalarToNextPow2(0)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
      .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
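  // Load and store rules: the legalForTypesWithMemDesc table below lists the
  // (value type, pointer, memory type, minimum alignment) combinations that
  // are directly selectable; anything else is split, widened or scalarized by
  // the predicates that follow.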
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
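  // Atomic read-modify-write operations: 32/64-bit forms are legal on global,
  // LDS and region pointers; flat and FP atomics depend on the subtarget.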
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
    .clampScalar(0, S16, S64)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
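  // Vector element access: G_EXTRACT_VECTOR_ELT and G_INSERT_VECTOR_ELT use
  // different type indices for the vector and element operands, so their
  // rules are built in a loop over both opcodes.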
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
      .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)

  if (ST.hasScalarPackInsts()) {
      .minScalarOrElt(0, S16)
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2)
    .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
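  // G_MERGE_VALUES / G_UNMERGE_VALUES: the "big" and "little" type indices
  // swap between the two opcodes, so the rules are built in a shared loop.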
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
          const LLT BigTy = Query.Types[BigTyIdx];
      .widenScalarToNextPow2(LitTyIdx, 16)
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, 32)
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
            const LLT Ty = Query.Types[LitTyIdx];
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

    .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder({
      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getLegacyLegalizerInfo().computeTables();
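// legalizeCustom: dispatch on the opcode for every rule registered above with
// customFor()/customIf() and call the matching legalize* helper.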
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
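// getSegmentAperture: read the shared/private aperture either from the
// dedicated aperture registers, or by loading it from the implicit kernarg
// segment / queue pointer when aperture registers are unavailable.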
  if (ST.hasApertureRegs()) {
                                   ? AMDGPU::SRC_SHARED_BASE
                                   : AMDGPU::SRC_PRIVATE_BASE;
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

    Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
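// legalizeAddrSpaceCast: flat <-> segment casts compare against the address
// space's null value and build or extract the 32-bit aperture half as needed;
// unsupported casts are diagnosed and replaced.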
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
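// legalizeFrint / legalizeFceil / legalizeFrem: expand f64 rounding and FP
// remainder in terms of trunc, copysign, compare and select.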
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
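// legalizeITOFP / legalizeFPTOI: 64-bit integer <-> floating point
// conversions are expanded into 32-bit halves plus ldexp / fma-based
// rounding fixups.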
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
      .addUse(CvtHi.getReg(0))
      .addUse(ThirtyTwo.getReg(0));

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
      .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
    .addUse(FVal.getReg(0))
    .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

  if (Signed && SrcLT == S32) {
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(S64,
    K1 = B.buildFConstant(S64,
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  LLT VecTy = MRI.getType(Vec);

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  LLT VecTy = MRI.getType(Vec);

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
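// legalizeGlobalValue: LDS globals become absolute addresses or a
// groupstaticsize query; other globals are materialized PC-relative, with a
// GOT load for symbols that are not directly reachable.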
                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  LLT Ty = MRI.getType(DstReg);

        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

        *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();

  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
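// buildMultiply: expand a wide G_MUL into 32-bit pieces, accumulating partial
// products either with plain mul / add-with-carry chains or with
// G_AMDGPU_MAD_U64_U32 when the subtarget provides it.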
    bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

      CarryAccum = getZero32();
      CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
      for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
          B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
        if (!LocalAccum[0]) {
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;

      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

      *ArgRC, B.getDebugLoc(), ArgTy);
  if (Arg->isMasked()) {
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);

      B.buildConstant(DstReg, 0);
      B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

    Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
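// legalizeUnsignedDIV_REM: 32-bit unsigned division is expanded with a
// reciprocal-iteration sequence (RCP_IFLAG plus two correction steps);
// 64-bit division builds a 64-bit reciprocal from two 32-bit halves.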
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
    R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);

  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi,
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(S32, Trunc,

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

    auto Sel1 = B.buildSelect(
    auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)

      MI.eraseFromParent();

    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))

      MI.eraseFromParent();

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();

  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))

  MI.eraseFromParent();
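// Full-precision FDIV lowering: f32 and f64 use the div_scale / div_fmas /
// div_fixup sequence, temporarily enabling FP32 denormals via S_DENORM_MODE
// (or SETREG on older subtargets) around the reciprocal refinement.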
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)

  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  if (!Mode.allFP32Denormals())

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
                              Scale1Unmerge.getReg(1));
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
    .addUse(Fmas.getReg(0))

  MI.eraseFromParent();

  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();

  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;

  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);
  MI.removeOperand(1);

  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));

  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();

                                              unsigned AddrSpace) const {
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());

  MI.eraseFromParent();
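// splitBufferOffsets: split a buffer access offset into a register base and a
// 12-bit immediate (MaxImm = 4095), folding any overflow back into the base.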
std::pair<Register, unsigned>
  const unsigned MaxImm = 4095;

  std::tie(BaseReg, ImmOffset) =

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);

                                        unsigned ImmOffset, Register VIndex,
  std::optional<ValueAndVReg> MaybeVOffsetVal =
  std::optional<ValueAndVReg> MaybeSOffsetVal =
  std::optional<ValueAndVReg> MaybeVIndexVal =

  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
    uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
                           MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;

                                                  bool ImageStore) const {
  LLT StoreVT = MRI.getType(Reg);

    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
  LLT Ty = MRI->getType(VData);

                                              bool IsFormat) const {
  LLT Ty = MRI.getType(VData);
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const int MemSize = MMO->getSize();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(3).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
     .addImm(HasVIndex ? -1 : 0)
     .addMemOperand(MMO);

  MI.eraseFromParent();
4473 unsigned ImmOffset,
unsigned Format,
4476 auto MIB =
B.buildInstr(Opc)
4487 MIB.addImm(AuxiliaryData)
4488 .addImm(HasVIndex ? -1 : 0)
4489 .addMemOperand(MMO);
4496 bool IsTyped)
const {
4506 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
4507 bool IsTFE =
MI.getNumExplicitDefs() == 2;
4509 StatusDst =
MI.getOperand(1).getReg();
4513 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
4516 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4519 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
4522 VIndex =
MI.getOperand(3 + OpOffset).getReg();
4525 VIndex =
B.buildConstant(S32, 0).getReg(0);
4528 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
4529 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
4533 Format =
MI.getOperand(5 + OpOffset).getImm();
4537 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
4540 LLT Ty =
MRI.getType(Dst);
4542 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
4554 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4555 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4556 }
else if (IsFormat) {
4560 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4562 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4563 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4570 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4573 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4576 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4583 unsigned NumLoadDWords = NumValueDWords + 1;
4585 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
4586 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4587 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
4588 if (NumValueDWords == 1) {
4589 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The unpacked load returns each element in its own dword; truncate back
    // to the 16-bit element type and rebuild the result vector.
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
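
// Rewrite the amdgcn atomic increment/decrement intrinsics into the generic
// G_ATOMICRMW_UINC_WRAP / G_ATOMICRMW_UDEC_WRAP operations.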
  unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
                         AMDGPU::G_ATOMICRMW_UDEC_WRAP;
  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(2).getReg())
      .addUse(MI.getOperand(3).getReg())
      .cloneMemRefs(MI);
  MI.eraseFromParent();
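
// Map a raw/struct buffer atomic intrinsic ID onto the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode.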
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
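
// Lower buffer atomic intrinsics. Compare-swap carries an extra compare
// value operand, and atomics without a return value shift the operand
// indices by one; HasReturn and OpOffset track both cases.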
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;
  int OpOffset = 0;
  if (HasReturn)
    Dst = MI.getOperand(0).getReg();
  else
    OpOffset = -1;

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;
  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
  if (HasReturn)
    MIB.addDef(Dst);
  MIB.addUse(VData);
  if (IsCmpSwap)
    MIB.addReg(CmpVal);
  MIB.addUse(RSrc).addUse(VIndex).addUse(VOffset).addUse(SOffset)
      .addImm(ImmOffset)
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
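
// Pack 16-bit image address operands (A16 coordinates and/or G16 gradients)
// into <2 x s16> registers, padding odd leftover elements with undef.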
                                      bool IsA16, bool IsG16) {
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit slot.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
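
// Collect the scalar vaddr operands into a single build_vector register and
// clear the now-unused operand slots.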
                                     int DimIdx, int NumVAddrs) {
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
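
// Legalize an image intrinsic: derive the D16/A16/G16 modes from the operand
// types, fold away no-op loads with a zero dmask, and rewrite the instruction
// into a G_AMDGPU_INTRIN_IMAGE_LOAD/STORE pseudo.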
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated earlier,
      // but this could still be valid.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {