34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
37#define DEBUG_TYPE "amdgpu-legalinfo"
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
47 "amdgpu-global-isel-new-legality",
48 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
  const LLT Ty = Query.Types[TypeIdx];

  EltSize > 1 && EltSize < 32 &&

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Pieces = (Size + 63) / 64;

  const LLT Ty = Query.Types[TypeIdx];
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

  const LLT Ty = Query.Types[TypeIdx];
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  return EltSize == 16 || EltSize % 32 == 0;

  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;

  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
         Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

                                    bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
                       AtomicOrdering::NotAtomic))
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                                 Align(AlignBits / 8)))

  return EltSize != 32 && EltSize != 64;
  if (Size != MemSizeInBits)
                               uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
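// Helpers that cast a 128-bit buffer resource pointer operand to and from a
// <4 x s32> vector so it can be manipulated with ordinary vector operations.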
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

  Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
  std::array<Register, 4> VectorElems;
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  for (unsigned I = 0; I < NumParts; ++I)
        B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
  B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
  for (unsigned I = 0; I < NumParts; ++I)
  return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
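// AMDGPULegalizerInfo constructor: defines the legalization rules for each
// generic opcode, keyed off subtarget features queried through ST.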
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)

    .legalFor({S32, S16, V2S16})
    .clampMaxNumElementsStrict(0, S16, 2)

    .clampMaxNumElementsStrict(0, S16, 2)

    .legalFor({S32, S16, V2S16})
    .minScalarOrElt(0, S16)

    .legalFor({S32, S16})

    .widenScalarToNextMultipleOf(0, 32)

    .legalFor({S32, S16})

    .widenScalarToNextMultipleOf(0, 32)

    .widenScalarToNextMultipleOf(0, 32);

  Mul.maxScalar(0, S32);

    .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)

    .clampMaxNumElements(0, S8, 2)

    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)

    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S16, S64);

      { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
        G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
    .customFor({S32, S64});

      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

    .legalFor(FPTypesPK16)

    .customFor({S32, S64})
    .clampScalar(0, S16, S64);

    .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
    .maxScalarIf(typeIs(0, S16), 1, S16)
    .clampScalar(1, S32, S32)

    .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})

    .legalFor({S32, S64})
    .clampScalar(0, S32, S64);

    .clampScalar(0, S32, S64);

    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(0, S32, S64)
    .clampScalar(1, S32, S32)

    .clampScalar(1, S32, S32)

    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))

    .lowerFor({S64, V2S16});

    .lowerFor({S64, S16, V2S16});

  FMad.customFor({S32, S16});
  FMad.customFor({S32});
  FMad.customFor({S16});

  FRem.customFor({S16, S32, S64});
  FRem.minScalar(0, S32)
      .customFor({S32, S64});

    .clampMaxNumElements(0, S16, 2)

    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})

    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
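// Rounding/truncation: G_INTRINSIC_TRUNC, G_FCEIL and G_INTRINSIC_ROUNDEVEN
// are legal for f16 only when the subtarget has 16-bit instructions;
// otherwise they are clamped to f32/f64.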
  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
    .unsupportedFor({BufferFatPtr, RsrcPtr})
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
        {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)

  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

  auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32);

  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypes16)
      .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypesBase)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElements(0, S16, 2)
        .widenScalarToNextPow2(0)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
      .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
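// G_ADDRSPACE_CAST is custom-lowered. The load/store rules below use the
// needToSplitMemOp predicate to decide when an access must be broken into
// dword-sized pieces, based on address space, alignment and dwordx3 support.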
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
      if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
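// G_SELECT and the shift opcodes: packed 16-bit forms are only legal with
// VOP3P, and shift amounts are always clamped to 32 bits.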
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
    .clampScalar(0, S16, S64)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const LLT IdxTy = Query.Types[IdxTypeIdx];
        const bool isLegalVecType =
        return (EltSize == 32 || EltSize == 64) &&
        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      const LLT &EltTy = Query.Types[1].getElementType();
      return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];
        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];
      .widenScalarToNextPow2(BigTyIdx, 32);

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)

  if (ST.hasScalarPackInsts()) {
      .minScalarOrElt(0, S16)
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2)
    .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
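// G_MERGE_VALUES / G_UNMERGE_VALUES: widen element types toward a
// register-sized scalar and clamp the wide type to the maximum supported
// scalar width.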
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
        const LLT BigTy = Query.Types[BigTyIdx];
      .widenScalarToNextPow2(LitTyIdx, 16)
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, 32)
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
          const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

    .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder({
      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getLegacyLegalizerInfo().computeTables();
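// legalizeCustom(): dispatches each custom-marked opcode to its dedicated
// lowering routine.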
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
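// getSegmentAperture: reads the LDS/private aperture either directly from
// SRC_SHARED_BASE / SRC_PRIVATE_BASE (when the subtarget has aperture
// registers) or by loading it via the kernarg/queue pointer.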
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

    Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();

  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
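// G_INTRINSIC_TRUNC for f64: mask off the fraction bits below the exponent,
// handling the sign word and the exponent > 51 case explicitly.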
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
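// G_FPTOSI/G_FPTOUI to i64: scale by 2^-32, floor to get the high half, then
// use an FMA with -2^32 to recover the low half before merging the pieces.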
  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

  if (Signed && SrcLT == S32) {
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();

                                       unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    Register AddrHi = MRI.createGenericVirtualRegister(S32);
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);

  LLT Ty = MRI.getType(DstReg);

        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

        *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();

  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

    if (WideMemSize == ValSize) {
      MI.setMemRefs(MF, {WideMMO});

    if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
  case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
                                                  unsigned Flags) const {

  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
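// G_FLOG/G_FLOG10 lowering: f16 is extended to f32 around amdgcn_log; the
// f32 result is refined with high/low constant pairs and rescaled when the
// input had to be scaled out of the denormal range.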
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);

    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();

  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {

    Register LogVal = MRI.createGenericVirtualRegister(F32);
    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);

  MI.eraseFromParent();

                                        unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                              RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();

  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
          .addUse(Mul.getReg(0))
      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
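// Full G_FEXP lowering: split x*log2(e) into high/low parts, evaluate exp2 on
// the rounded high part, apply ldexp, and clamp the result to 0 or +inf when
// the input is outside the representable range.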
  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);
  const bool IsExp10 = false;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(F32);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(F64);

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
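// buildMultiply(): expands a wide G_MUL into a grid of 32-bit partial
// products, optionally using G_AMDGPU_MAD_U64_U32 for 64-bit accumulation and
// keeping odd-aligned products in a separate accumulator when profitable.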
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;

      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

      *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);

    B.buildConstant(DstReg, 0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

                                                    Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();

  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
    R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
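// 64-bit unsigned division/remainder: build a reciprocal estimate from two
// float reciprocal steps, refine it with mul-hi corrections, then apply up to
// two conditional fix-ups to the quotient and remainder.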
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};

  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(

  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();

  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

      MI.eraseFromParent();

    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))

      MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();

  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
    .addUse(RDst.getReg(0))

  MI.eraseFromParent();

  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
        .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
        .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
    .addUse(Fmas.getReg(0))

  MI.eraseFromParent();

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  auto NumUnmerge = B.buildUnmerge(S32, LHS);
  auto DenUnmerge = B.buildUnmerge(S32, RHS);
  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);