#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);
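// The helper lambdas that follow are legality predicates and mutations handed
// to the rule builders later in this file. Each one inspects a LegalityQuery
// (the candidate instruction's register types and memory-operand descriptors)
// and either answers a yes/no legality question or returns a
// (type index, new type) pair describing how the operand should be mutated.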
68 const LLT Ty = Query.Types[TypeIdx];
75 EltSize > 1 && EltSize < 32 &&
82 const LLT Ty = Query.Types[TypeIdx];
89 const LLT Ty = Query.Types[TypeIdx];
97 const LLT Ty = Query.Types[TypeIdx];
99 return std::make_pair(TypeIdx,
106 const LLT Ty = Query.Types[TypeIdx];
109 unsigned Pieces = (Size + 63) / 64;
111 return std::make_pair(
121 const LLT Ty = Query.Types[TypeIdx];
126 const int NextMul32 = (Size + 31) / 32;
130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
149 const LLT Ty = Query.Types[TypeIdx];
156 const LLT Ty = Query.Types[TypeIdx];
159 return std::make_pair(
166 const LLT QueryTy = Query.Types[TypeIdx];
173 const LLT QueryTy = Query.Types[TypeIdx];
180 const LLT QueryTy = Query.Types[TypeIdx];
191 return EltSize == 16 || EltSize % 32 == 0;
  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();
  if (IsLoad && MemSize < Size)
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                     Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;
  if (Size != MemSizeInBits)
                               uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
      Query.Types[1].getAddressSpace(), Opcode);
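// The fragments above come from the helpers that decide whether a memory
// access is usable as-is. Roughly, the access size is rounded up to a multiple
// of 32 bits and checked against what the subtarget allows for that address
// space and alignment; accesses that fail the check are split or widened by
// the load/store rules below.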
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
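  // AllS32Vectors / AllS64Vectors enumerate the vector-of-32-bit and
  // vector-of-64-bit element types the rules below accept directly; wider or
  // odd-sized vectors are clamped or split toward these shapes by the
  // element-count mutations attached to each rule.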
  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
      .legalFor(AllS32Vectors)
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .clampMaxNumElementsStrict(0, S16, 2)
        .legalFor({S32, S16, V2S16})
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {
        .legalFor({S32, S16})
        .widenScalarToNextMultipleOf(0, 32)
        .legalFor({S32, S16})
        .widenScalarToNextMultipleOf(0, 32)
        .widenScalarToNextMultipleOf(0, 32);

  if (ST.hasMad64_32())
    Mul.maxScalar(0, S32);

  if (ST.hasIntClamp()) {
        .minScalarOrElt(0, S32)
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .customFor({S32, S64})
      .clampScalar(0, S32, S64)
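  // AMDGPU has no hardware integer divide, so G_SDIV/G_UDIV/G_SREM/G_UREM and
  // the combined G_SDIVREM/G_UDIVREM are marked custom for S32/S64 here and
  // expanded later in this file (legalizeUnsignedDIV_REM /
  // legalizeSignedDIV_REM) into reciprocal-based multiply/subtract/select
  // sequences.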
  if (ST.hasVOP3PInsts()) {
      .clampMaxNumElements(0, S8, 2)

      .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
      .clampScalar(0, S32, S64)

       G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
      .legalFor({{S32, S1}, {S32, S32}})

      .legalFor({S1, S32, S64, S16, GlobalPtr,
                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
      .legalFor({S32, S64});
      .customFor({S32, S64});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
  } else if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
      .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
      .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
758 .legalFor(FPTypesPK16)
763 if (ST.has16BitInsts()) {
765 .legalFor({S32, S64, S16})
767 .clampScalar(0, S16, S64);
772 .clampScalar(0, S32, S64);
774 if (ST.hasFractBug()) {
777 .legalFor({S32, S64})
779 .clampScalar(0, S32, S64);
784 .clampScalar(0, S32, S64);
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
799 if (ST.has16BitInsts()) {
802 .legalFor({S32, S16})
804 .lowerFor({S64, V2S16});
810 .lowerFor({S64, S16, V2S16});
815 .clampScalar(0, S32, S64);
819 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820 FMad.customFor({S32, S16});
821 else if (ST.hasMadMacF32Insts())
822 FMad.customFor({S32});
823 else if (ST.hasMadF16())
824 FMad.customFor({S16});
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
    FRem.minScalar(0, S32)
        .customFor({S32, S64});

      .clampMaxNumElements(0, S16, 2)
      .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
                 {S32, S1}, {S64, S1}, {S16, S1}})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);

      .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
  if (ST.has16BitInsts())
      .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
      .customFor({{S64, S32}, {S64, S64}})
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .scalarSameSizeAs(1, 0);
  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP)
      .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);
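  // Only base-2 exp/log exist natively, so G_FEXP, G_FLOG, G_FLOG10 and G_FPOW
  // are registered as custom below and rewritten further down in terms of
  // G_FEXP2/G_FLOG2 plus a constant scale factor.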
960 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961 if (
ST.has16BitInsts())
962 ExpOps.customFor({{S32}, {S16}});
964 ExpOps.customFor({S32});
965 ExpOps.clampScalar(0, MinScalarFPTy, S32)
968 getActionDefinitionsBuilder(G_FPOWI)
969 .clampScalar(0, MinScalarFPTy, S32)
973 getActionDefinitionsBuilder(G_CTPOP)
974 .legalFor({{S32, S32}, {S32, S64}})
975 .clampScalar(0, S32, S32)
976 .widenScalarToNextPow2(1, 32)
977 .clampScalar(1, S32, S64)
979 .widenScalarToNextPow2(0, 32);
985 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
987 .clampScalar(0, S32, S32)
988 .clampScalar(1, S32, S64)
989 .widenScalarToNextPow2(0, 32)
990 .widenScalarToNextPow2(1, 32)
994 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995 .legalFor({{S32, S32}, {S32, S64}})
996 .clampScalar(0, S32, S32)
997 .clampScalar(1, S32, S64)
999 .widenScalarToNextPow2(0, 32)
1000 .widenScalarToNextPow2(1, 32);
1004 getActionDefinitionsBuilder(G_BITREVERSE)
1005 .legalFor({S32, S64})
1006 .clampScalar(0, S32, S64)
1008 .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
1021 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022 .legalFor({S32, S16, V2S16})
1024 .clampMaxNumElements(0, S16, 2)
1026 .widenScalarToNextPow2(0)
1030 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031 .legalFor({S32, S16})
1032 .widenScalarToNextPow2(0)
1039 getActionDefinitionsBuilder(G_BSWAP)
1044 .widenScalarToNextPow2(0)
1049 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1052 .widenScalarToNextPow2(0)
1057 getActionDefinitionsBuilder(G_INTTOPTR)
1059 .legalForCartesianProduct(AddrSpaces64, {S64})
1060 .legalForCartesianProduct(AddrSpaces32, {S32})
1073 getActionDefinitionsBuilder(G_PTRTOINT)
1075 .legalForCartesianProduct(AddrSpaces64, {S64})
1076 .legalForCartesianProduct(AddrSpaces32, {S32})
1090 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
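    // Each row in the legalForTypesWithMemDesc() list below reads
    // {register type, pointer type, memory type, minimum alignment in bits};
    // e.g. {S32, GlobalPtr, S8, 8} declares a byte-aligned 8-bit access in the
    // global address space feeding a 32-bit register as legal.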
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.customIf(typeIs(1, Constant32Ptr));
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
            if (DstSize > MemSize)
            if (MemSize > MaxSize)
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
            if (MemSize > MaxSize) {
              if (MaxSize % EltSize == 0) {
                return std::make_pair(
              unsigned NumPieces = MemSize / MaxSize;
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);
              return std::make_pair(
            return std::make_pair(0, EltTy);
            return std::make_pair(
            return std::make_pair(0, EltTy);
      .widenScalarToNextPow2(0)

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
    .clampScalar(0, S16, S64)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
      .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)

  if (ST.hasScalarPackInsts()) {
      .minScalarOrElt(0, S16)

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
    BuildVector.minScalarOrElt(0, S32);
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2)
    .clampMaxNumElements(0, S16, 64);

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
        const LLT BigTy = Query.Types[BigTyIdx];
      .widenScalarToNextPow2(LitTyIdx, 16)
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, 32)
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
          const LLT Ty = Query.Types[LitTyIdx];
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

    .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder({
      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getLegacyLegalizerInfo().computeTables();
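// Everything marked customFor()/customIf() above is routed to
// AMDGPULegalizerInfo::legalizeCustom; the switch below dispatches each opcode
// to a dedicated legalize* helper that rebuilds the operation with
// MachineIRBuilder and erases the original instruction.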
  switch (MI.getOpcode()) {
1741 case TargetOpcode::G_ADDRSPACE_CAST:
1743 case TargetOpcode::G_FRINT:
1745 case TargetOpcode::G_FCEIL:
1747 case TargetOpcode::G_FREM:
1749 case TargetOpcode::G_INTRINSIC_TRUNC:
1751 case TargetOpcode::G_SITOFP:
1753 case TargetOpcode::G_UITOFP:
1755 case TargetOpcode::G_FPTOSI:
1757 case TargetOpcode::G_FPTOUI:
1759 case TargetOpcode::G_FMINNUM:
1760 case TargetOpcode::G_FMAXNUM:
1761 case TargetOpcode::G_FMINNUM_IEEE:
1762 case TargetOpcode::G_FMAXNUM_IEEE:
1764 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1766 case TargetOpcode::G_INSERT_VECTOR_ELT:
1768 case TargetOpcode::G_SHUFFLE_VECTOR:
1770 case TargetOpcode::G_FSIN:
1771 case TargetOpcode::G_FCOS:
1773 case TargetOpcode::G_GLOBAL_VALUE:
1775 case TargetOpcode::G_LOAD:
1776 case TargetOpcode::G_SEXTLOAD:
1777 case TargetOpcode::G_ZEXTLOAD:
1779 case TargetOpcode::G_FMAD:
1781 case TargetOpcode::G_FDIV:
1783 case TargetOpcode::G_UDIV:
1784 case TargetOpcode::G_UREM:
1785 case TargetOpcode::G_UDIVREM:
1787 case TargetOpcode::G_SDIV:
1788 case TargetOpcode::G_SREM:
1789 case TargetOpcode::G_SDIVREM:
1791 case TargetOpcode::G_ATOMIC_CMPXCHG:
1793 case TargetOpcode::G_FLOG:
1795 case TargetOpcode::G_FLOG10:
1797 case TargetOpcode::G_FEXP:
1799 case TargetOpcode::G_FPOW:
1801 case TargetOpcode::G_FFLOOR:
1803 case TargetOpcode::G_BUILD_VECTOR:
1805 case TargetOpcode::G_MUL:
1807 case TargetOpcode::G_CTLZ:
1808 case TargetOpcode::G_CTTZ:
1810 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  if (ST.hasApertureRegs()) {
    B.buildInstr(AMDGPU::S_GETREG_B32)
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);

      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  switch (Def->getOpcode()) {
1914 case AMDGPU::G_FRAME_INDEX:
1915 case AMDGPU::G_GLOBAL_VALUE:
1916 case AMDGPU::G_BLOCK_ADDR:
1918 case AMDGPU::G_CONSTANT: {
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

  auto HighAddr = B.buildConstant(
  B.buildMerge(Dst, {Src, HighAddr});
  MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
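// The addrspacecast lowering above has three shapes: a no-op cast becomes a
// G_BITCAST; a flat-to-segment cast extracts the low 32 bits and selects the
// segment null value when the source equals the flat null pointer; and a
// segment-to-flat cast merges the 32-bit offset with the aperture base and
// selects the flat null value for segment-null inputs.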
  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
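// The G_FCEIL expansion above computes
//   result = trunc(src); if (src > 0.0 && src != result) result += 1.0;
// i.e. it adds one only when truncation discarded a positive fractional part.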
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
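// G_FREM is expanded above as
//   frem(x, y) = fma(-trunc(x / y), y, x)
// which is x - trunc(x/y) * y evaluated with a fused multiply-add.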
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
      .addUse(CvtHi.getReg(0))
      .addUse(ThirtyTwo.getReg(0));

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
      .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
    .addUse(FVal.getReg(0))
    .addUse(Scale.getReg(0));
  MI.eraseFromParent();
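// For 64-bit integers converted to f32 the value is first normalized with a
// left shift (derived from the leading sign/zero bit count), converted as a
// 32-bit integer, and then rescaled with amdgcn_ldexp by (32 - shift) so the
// exponent accounts for the bits shifted out.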
  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

  if (Signed && SrcLT == S32) {
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(S64,
    K1 = B.buildFConstant(S64,
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    Sign = B.buildMerge(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMerge(Dst, SrcRegs);

  MI.eraseFromParent();
  if (SrcTy == V2S16 && DstTy == V2S16 &&

  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)

  MI.eraseFromParent();
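// G_FSIN/G_FCOS are lowered above by scaling the input by 1/(2*pi)
// (additionally taking amdgcn_fract on subtargets that need a reduced
// argument) and then emitting amdgcn_sin or amdgcn_cos, which expect the
// argument in that normalized form.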
                                              unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
    B.buildExtract(DstReg, PCReg, 0);

        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(),
    B.buildUndef(DstReg);
    MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

        *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();
    MI.eraseFromParent();

  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
  MI.eraseFromParent();

  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
  MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)
  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

         "this should not have been custom lowered");

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))

    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);

  MI.eraseFromParent();
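// G_FPOW is expanded above as exp2(y * log2(x)), with the multiply emitted as
// amdgcn_fmul_legacy and f16 operands extended to f32 around the intermediate
// computation.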
    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
2917 bool UsePartialMad64_32,
bool SeparateOddAlignedProducts)
const {
2931 auto getZero32 = [&]() ->
Register {
2933 Zero32 =
B.buildConstant(S32, 0).getReg(0);
2936 auto getZero64 = [&]() ->
Register {
2938 Zero64 =
B.buildConstant(S64, 0).getReg(0);
2948 if (CarryIn.empty())
2951 bool HaveCarryOut =
true;
2953 if (CarryIn.size() == 1) {
2955 LocalAccum =
B.buildZExt(S32, CarryIn[0]).getReg(0);
2959 CarryAccum = getZero32();
2961 CarryAccum =
B.buildZExt(S32, CarryIn[0]).getReg(0);
2962 for (
unsigned i = 1;
i + 1 < CarryIn.size(); ++
i) {
2964 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[
i])
2969 LocalAccum = getZero32();
2970 HaveCarryOut =
false;
2975 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2976 LocalAccum = Add.getReg(0);
2977 return HaveCarryOut ? Add.getReg(1) :
Register();
2990 auto buildMadChain =
2993 assert((DstIndex + 1 < Accum.
size() && LocalAccum.
size() == 2) ||
2994 (DstIndex + 1 >= Accum.
size() && LocalAccum.
size() == 1));
3001 if (LocalAccum.
size() == 1 &&
3002 (!UsePartialMad64_32 || !CarryIn.empty())) {
3004 unsigned j1 = DstIndex - j0;
3005 auto Mul =
B.buildMul(S32, Src0[j0], Src1[j1]);
3006 if (!LocalAccum[0]) {
3007 LocalAccum[0] =
Mul.getReg(0);
3009 if (CarryIn.empty()) {
3010 LocalAccum[0] =
B.buildAdd(S32, LocalAccum[0],
Mul).getReg(0);
3013 B.buildUAdde(S32, S1, LocalAccum[0],
Mul, CarryIn.back())
3019 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3023 if (j0 <= DstIndex) {
3024 bool HaveSmallAccum =
false;
3027 if (LocalAccum[0]) {
3028 if (LocalAccum.
size() == 1) {
3029 Tmp =
B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3030 HaveSmallAccum =
true;
3031 }
else if (LocalAccum[1]) {
3032 Tmp =
B.buildMerge(S64, LocalAccum).getReg(0);
3033 HaveSmallAccum =
false;
3035 Tmp =
B.buildZExt(S64, LocalAccum[0]).getReg(0);
3036 HaveSmallAccum =
true;
3039 assert(LocalAccum.
size() == 1 || !LocalAccum[1]);
3041 HaveSmallAccum =
true;
3045 unsigned j1 = DstIndex - j0;
3046 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3047 {Src0[j0], Src1[j1], Tmp});
3048 Tmp = Mad.getReg(0);
3049 if (!HaveSmallAccum)
3050 CarryOut.push_back(Mad.getReg(1));
3051 HaveSmallAccum =
false;
3053 }
while (j0 <= DstIndex);
3055 auto Unmerge =
B.buildUnmerge(S32, Tmp);
3056 LocalAccum[0] = Unmerge.getReg(0);
3057 if (LocalAccum.
size() > 1)
3058 LocalAccum[1] = Unmerge.getReg(1);
3085 for (
unsigned i = 0;
i <= Accum.
size() / 2; ++
i) {
3087 Carry EvenCarryIn =
std::move(EvenCarry);
3092 if (2 *
i < Accum.
size()) {
3093 auto LocalAccum = Accum.
drop_front(2 *
i).take_front(2);
3094 EvenCarry = buildMadChain(LocalAccum, 2 *
i, EvenCarryIn);
3099 if (!SeparateOddAlignedProducts) {
3100 auto LocalAccum = Accum.
drop_front(2 *
i - 1).take_front(2);
3101 OddCarry = buildMadChain(LocalAccum, 2 *
i - 1, OddCarryIn);
3103 bool IsHighest = 2 *
i >= Accum.
size();
3106 .take_front(IsHighest ? 1 : 2);
3107 OddCarry = buildMadChain(LocalAccum, 2 *
i - 1, OddCarryIn);
3113 Lo =
B.buildUAddo(S32, S1, Accum[2 *
i - 1], SeparateOddOut[0]);
3115 Lo =
B.buildAdd(S32, Accum[2 *
i - 1], SeparateOddOut[0]);
3117 Lo =
B.buildUAdde(S32, S1, Accum[2 *
i - 1], SeparateOddOut[0],
3120 Accum[2 *
i - 1] = Lo->getOperand(0).getReg();
3123 auto Hi =
B.buildUAdde(S32, S1, Accum[2 *
i], SeparateOddOut[1],
3124 Lo->getOperand(1).getReg());
3125 Accum[2 *
i] = Hi.getReg(0);
3126 SeparateOddCarry = Hi.getReg(1);
3133 if (
Register CarryOut = mergeCarry(Accum[2 *
i - 1], OddCarryIn))
3134 EvenCarryIn.push_back(CarryOut);
3136 if (2 *
i < Accum.
size()) {
3137 if (
Register CarryOut = mergeCarry(Accum[2 *
i], EvenCarryIn))
3138 OddCarry.push_back(CarryOut);
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  unsigned NumParts = Size / 32;
  assert((Size % 32) == 0);

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMerge(DstReg, AccumRegs);
  MI.eraseFromParent();

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();

  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

                              *ArgRC, B.getDebugLoc(), ArgTy);
  if (Arg->isMasked()) {
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);
    B.buildConstant(DstReg, 0);
    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

                                                  int64_t Offset) const {
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  auto COffset = B.buildConstant(LLT::scalar(64), Offset);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

                                        Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
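// The 32-bit unsigned division expansion above follows the usual reciprocal
// scheme. A rough sketch with illustrative names only:
//   float fy = uitofp(y);
//   u32   z  = fptoui(rcp_iflag(fy) * K);  // K just below 2^32, so z ~ 2^32/y
//   z       += umulh(z, -y * z);           // one Newton-Raphson refinement
//   u32   q  = umulh(x, z);
//   u32   r  = x - q * y;
//   if (r >= y) { ++q; r -= y; }           // up to two corrections (the
//   if (r >= y) { ++q; r -= y; }           // paired selects above)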
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      S32, CvtHi, B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(S32, Trunc,

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3540 auto Rcp =
B.buildMerge(S64, {RcpLo, RcpHi});
3542 auto Zero64 =
B.buildConstant(S64, 0);
3543 auto NegDenom =
B.buildSub(S64, Zero64, Denom);
3545 auto MulLo1 =
B.buildMul(S64, NegDenom, Rcp);
3546 auto MulHi1 =
B.buildUMulH(S64, Rcp, MulLo1);
3548 auto UnmergeMulHi1 =
B.buildUnmerge(S32, MulHi1);
3549 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3550 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3552 auto Add1_Lo =
B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3553 auto Add1_Hi =
B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3554 auto Add1 =
B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3556 auto MulLo2 =
B.buildMul(S64, NegDenom, Add1);
3557 auto MulHi2 =
B.buildUMulH(S64, Add1, MulLo2);
3558 auto UnmergeMulHi2 =
B.buildUnmerge(S32, MulHi2);
3559 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3560 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3562 auto Zero32 =
B.buildConstant(S32, 0);
3563 auto Add2_Lo =
B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3564 auto Add2_Hi =
B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3565 auto Add2 =
B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3567 auto UnmergeNumer =
B.buildUnmerge(S32, Numer);
3568 Register NumerLo = UnmergeNumer.getReg(0);
3569 Register NumerHi = UnmergeNumer.getReg(1);
3571 auto MulHi3 =
B.buildUMulH(S64, Numer, Add2);
3572 auto Mul3 =
B.buildMul(S64, Denom, MulHi3);
3573 auto UnmergeMul3 =
B.buildUnmerge(S32, Mul3);
3574 Register Mul3_Lo = UnmergeMul3.getReg(0);
3575 Register Mul3_Hi = UnmergeMul3.getReg(1);
3576 auto Sub1_Lo =
B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3577 auto Sub1_Hi =
B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3578 auto Sub1_Mi =
B.buildSub(S32, NumerHi, Mul3_Hi);
3579 auto Sub1 =
B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3581 auto UnmergeDenom =
B.buildUnmerge(S32, Denom);
3582 Register DenomLo = UnmergeDenom.getReg(0);
3583 Register DenomHi = UnmergeDenom.getReg(1);
3586 auto C1 =
B.buildSExt(S32, CmpHi);
3589 auto C2 =
B.buildSExt(S32, CmpLo);
3592 auto C3 =
B.buildSelect(S32, CmpEq, C2,
C1);
3599 auto Sub2_Lo =
B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3600 auto Sub2_Mi =
B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3601 auto Sub2_Hi =
B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3602 auto Sub2 =
B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3604 auto One64 =
B.buildConstant(S64, 1);
3605 auto Add3 =
B.buildAdd(S64, MulHi3, One64);
3611 auto C6 =
B.buildSelect(
3615 auto Add4 =
B.buildAdd(S64, Add3, One64);
3616 auto Sub3_Lo =
B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3618 auto Sub3_Mi =
B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3619 auto Sub3_Hi =
B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3620 auto Sub3 =
B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3626 auto Sel1 =
B.buildSelect(
3633 auto Sel2 =
B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  MI.eraseFromParent();
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
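// Signed division above is reduced to the unsigned expansion by conditionally
// negating the operands (add the sign bit mask, then xor with it, i.e. a
// branchless absolute value), dividing unsigned, and then re-applying the
// result sign with the same xor/subtract trick.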
  if (!AllowInaccurateRcp)

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)

    MI.eraseFromParent();

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
      .addUse(FNeg.getReg(0))

    MI.eraseFromParent();

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))

  MI.eraseFromParent();
                               AMDGPU::SIModeRegisterDefaults Mode) {
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)

  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  if (!Mode.allFP32Denormals())

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
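// The f32 G_FDIV path above follows the div_scale / rcp / div_fmas /
// div_fixup recipe: numerator and denominator are range-scaled, the
// approximate reciprocal is refined with a chain of FMAs (temporarily enabling
// FP32 denormals via the mode toggle when they are off by default), and
// div_fixup restores the correctly scaled, IEEE-compliant quotient.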
3983 auto One =
B.buildFConstant(S64, 1.0);
3985 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
3991 auto NegDivScale0 =
B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3993 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64},
false)
3994 .addUse(DivScale0.getReg(0))
3997 auto Fma0 =
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3998 auto Fma1 =
B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3999 auto Fma2 =
B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4001 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
4007 auto Fma3 =
B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4008 auto Mul =
B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4009 auto Fma4 =
B.buildFMA(S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
4018 auto NumUnmerge =
B.buildUnmerge(S32,
LHS);
4019 auto DenUnmerge =
B.buildUnmerge(S32,
RHS);
4020 auto Scale0Unmerge =
B.buildUnmerge(S32, DivScale0);
4021 auto Scale1Unmerge =
B.buildUnmerge(S32, DivScale1);
4024 Scale1Unmerge.getReg(1));
4026 Scale0Unmerge.getReg(1));
4027 Scale =
B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4029 Scale = DivScale1.getReg(1);
4032 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64},
false)
4033 .addUse(Fma4.getReg(0))
4034 .addUse(Fma3.getReg(0))
4035 .addUse(
Mul.getReg(0))
4039 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
makeArrayRef(Res),
false)
4040 .addUse(Fmas.getReg(0))
4045 MI.eraseFromParent();
4060 auto Abs =
B.buildFAbs(S32,
RHS, Flags);
4063 auto C0 =
B.buildConstant(S32, 0x6f800000);
4064 auto C1 =
B.buildConstant(S32, 0x2f800000);
4068 auto Sel =
B.buildSelect(S32, CmpRes,
C1, C2, Flags);
4070 auto Mul0 =
B.buildFMul(S32,
RHS, Sel, Flags);
4072 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
4073 .addUse(Mul0.getReg(0))
4076 auto Mul1 =
B.buildFMul(S32,
LHS,
RCP, Flags);
4078 B.buildFMul(Res, Sel, Mul1, Flags);
4080 MI.eraseFromParent();
4098 auto Flags =
MI.getFlags();
4110 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty},
false)
4120 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4126 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4128 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129 MI.eraseFromParent();
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;

  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);
  MI.removeOperand(1);

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));

  MI.eraseFromParent();

                                            unsigned AddrSpace) const {
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());

  MI.eraseFromParent();
std::pair<Register, unsigned>
  const unsigned MaxImm = 4095;

  std::tie(BaseReg, ImmOffset) =
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
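// splitBufferOffsets above folds as much of the offset as fits into the
// 12-bit unsigned immediate field of MUBUF-style instructions (MaxImm = 4095)
// and returns the remainder as a register offset, materializing or adding the
// overflow into BaseReg when needed.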
                               unsigned ImmOffset, Register VIndex,
  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
4293 bool ImageStore)
const {
4300 auto Unmerge =
B.buildUnmerge(S16,
Reg);
4303 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4304 WideRegs.push_back(
B.buildAnyExt(S32, Unmerge.getReg(
I)).getReg(0));
4315 Reg =
B.buildBitcast(S32,
Reg).getReg(0);
4316 PackedRegs.push_back(
Reg);
4317 PackedRegs.
resize(2,
B.buildUndef(S32).getReg(0));
4324 auto Unmerge =
B.buildUnmerge(S16,
Reg);
4325 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4326 PackedRegs.push_back(Unmerge.getReg(
I));
4327 PackedRegs.
resize(6,
B.buildUndef(S16).getReg(0));
4335 auto Unmerge =
B.buildUnmerge(S32,
Reg);
4336 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4337 PackedRegs.push_back(Unmerge.getReg(
I));
4338 PackedRegs.
resize(4,
B.buildUndef(S32).getReg(0));
4380 bool IsFormat)
const {
4384 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
4391 const int MemSize = MMO->
getSize();
4396 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4399 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
4403 VIndex =
MI.getOperand(3).getReg();
4406 VIndex =
B.buildConstant(S32, 0).getReg(0);
4409 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
4410 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
4412 unsigned Format = 0;
4414 Format =
MI.getOperand(5 + OpOffset).getImm();
4418 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
4425 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4426 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4427 }
else if (IsFormat) {
4428 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4429 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4433 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4436 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4439 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4444 auto MIB =
B.buildInstr(Opc)
4455 MIB.addImm(AuxiliaryData)
4456 .addImm(HasVIndex ? -1 : 0)
4457 .addMemOperand(MMO);
4459 MI.eraseFromParent();
4467 bool IsTyped)
const {
4477 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4480 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
4484 VIndex =
MI.getOperand(3).getReg();
4487 VIndex =
B.buildConstant(S32, 0).getReg(0);
4490 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
4491 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
4493 unsigned Format = 0;
4495 Format =
MI.getOperand(5 + OpOffset).getImm();
4499 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
4504 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
4513 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4514 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4515 }
else if (IsFormat) {
4516 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4517 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4521 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4524 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4527 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4539 LoadDstReg =
B.getMRI()->createGenericVirtualRegister(S32);
4540 else if (Unpacked && IsD16 && Ty.
isVector())
4541 LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4545 auto MIB =
B.buildInstr(Opc)
4556 MIB.addImm(AuxiliaryData)
4557 .addImm(HasVIndex ? -1 : 0)
4558 .addMemOperand(MMO);
4560 if (LoadDstReg != Dst) {
4561 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
4565 B.buildTrunc(Dst, LoadDstReg);
4569 auto Unmerge =
B.buildUnmerge(S32, LoadDstReg);
4571 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
4572 Repack.push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
4573 B.buildMerge(Dst, Repack);
4577 MI.eraseFromParent();
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
  MI.eraseFromParent();
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
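// Minimal usage sketch for the mapping above: legalizeBufferAtomic selects the
// pseudo once and then appends the operands shared by all buffer atomics, e.g.
//   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
//   // ... then addDef/addUse of dst, vdata, rsrc, vindex, voffset, soffset ...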
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;
  int OpOffset = 0;
  if (HasReturn) {
    Dst = MI.getOperand(0).getReg();
  } else {
    // A few FP atomics do not support return values.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;
  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata
  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)                // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset)          // offset(imm)
      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
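// Operand-count bookkeeping, for reference: NumVIndexOps is the operand count
// of the struct (vindex-carrying) form -- 7 for a plain atomic, 8 for cmpswap
// (which adds the compare value), plus one def when the atomic returns a
// result -- so an instruction with fewer operands than that is the raw form.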
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on: bias has type half but
        // occupies a full dword.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Gradient or coordinate operands that should not be packed are
        // bitcast to v2s16 unchanged so they can join the packed sequence.
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
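// Worked example (hypothetical operand layout): for a 2D sample.d with 16-bit
// derivatives and coordinates [dsdh, dtdh, dsdv, dtdv, s, t], the loop above
// produces three v2s16 registers {dsdh,dtdh}, {dsdv,dtdv}, {s,t}; a trailing
// odd coordinate would instead be paired with an undef half.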
/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Pad a non-power-of-2 register count with undefs up to the count the
    // encoding expects (the rounding rule is only approximated here).
    if (!isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
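// Example: five packed dword address components are collected into one
// build_vector of s32 elements (padded with undefs when the count needs
// rounding), written into the first vaddr operand, while the remaining vaddr
// operands are cleared to $noreg.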
// Rewrite image intrinsics into the G_AMDGPU_INTRIN_IMAGE_* pseudos: pack
// 16-bit addresses, adjust dmask-driven result types, and repack d16/TFE
// results.
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2; // A second def means the TFE status result.

  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();

  // Check for 16-bit gradients and addresses and pack them if so.
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. It can be eliminated entirely.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }
  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // TFE with no enabled dmask lanes still needs at least one lane, otherwise
  // the instruction would be malformed.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }
  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    if (BaseOpcode->AtomicX2) {
      // Atomics with two data operands (e.g. cmpswap) pack both values into
      // one wider register.
      Register VData1 = MI.getOperand(3).getReg();
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }
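// Sketch: for an s32 image atomic cmpswap, the two data operands are fused
// into one <2 x s32> build_vector feeding operand 2, and operand 3 is cleared
// to $noreg so later selection sees a single packed source.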
  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // 16-bit addresses are only usable when the subtarget supports A16.
  if (IsA16 && !ST.hasA16())
    return false;

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // Use NSA only for a moderate number of packed address registers;
      // otherwise merge everything into a single wide vector register.
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Rewrite the vaddr operands with the packed registers and clear any
      // leftovers.
      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // Without 16-bit packing, choose between NSA and one packed vector address
    // based on the corrected vaddr count.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }
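// Heuristic recap: the NSA (non-sequential address) encoding is only used for
// a moderate address count (at least 3 and within the subtarget's NSA limit);
// otherwise the address registers are merged into one contiguous vector, which
// is what the concat above and convertImageAddrToPacked produce.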
  if (BaseOpcode->Store) {
    // Repack 16-bit store data when the subtarget uses an unpacked layout.
    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData)
      MI.getOperand(1).setReg(RepackedReg);
    return true;
  }
  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // Round the data portion of the result up to a whole number of dwords; TFE
  // adds one extra status dword on top of that.
  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;
  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  // The return type does not need adjustment.
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
  MI.getOperand(0).setReg(NewResultReg);

  // TFE really returns the data plus one extra status dword in a single
  // contiguous register; split the status result back out here.
  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    MI.removeOperand(1);

    // Easy case that needs no repacking.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old one.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    ResultRegs[0] = NewResultReg;
  } else {
    // Repack through per-component registers of RegTy.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to keep just the data part.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // A scalar s16 result is formed with a truncate; a lone packed v2s16 result
  // with a bitcast.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }
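// TFE note: at the IR level the intrinsic returns {data, i32 status} as a
// struct, but the machine instruction writes one contiguous register holding
// the data plus one extra status dword, so the code above retargets the defs
// and unmerges the status value back out.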
  if (IsD16) {
    // Cast the per-component registers back to the expected 16-bit form.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case: a v3s16 destination.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements())
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    else
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
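// Illustrative sketch: with a two-lane dmask but a <4 x s16> destination, the
// load produces a single v2s16 piece; padWithUndef appends an undef v2s16 so
// the final G_CONCAT_VECTORS still covers the full <4 x s16> result width.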
// Rewrite s.buffer.load into G_AMDGPU_S_BUFFER_LOAD and give it a memory
// operand.
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  // Bitcast the result to a legal register type if needed, then refresh Dst
  // and the insert point.
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.removeOperand(1); // Remove the intrinsic ID operand.

  // The intrinsic carries no memory operand of its own, so synthesize one.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
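// Note: the synthesized memory operand covers the result size rounded up to
// whole bytes ((Size + 7) / 8) with a fixed 4-byte alignment, standing in for
// the MMO the intrinsic itself does not carry.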
  switch (*HsaAbiVer) {