37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const std::initializer_list<LLT> FPTypesPK16_64 = {
S32,
S64,
S16,
V2S16,
738 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
761 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
763 if (ST.hasPackedU64Ops()) {
766 .clampMaxNumElementsStrict(0,
S16, 2)
772 }
else if (ST.hasScalarAddSub64()) {
775 .clampMaxNumElementsStrict(0,
S16, 2)
783 .clampMaxNumElementsStrict(0,
S16, 2)
790 if (ST.hasScalarSMulU64()) {
793 .clampMaxNumElementsStrict(0,
S16, 2)
801 .clampMaxNumElementsStrict(0,
S16, 2)
811 .minScalarOrElt(0,
S16)
816 }
else if (ST.has16BitInsts()) {
850 .widenScalarToNextMultipleOf(0, 32)
860 if (ST.hasMad64_32())
865 if (ST.hasIntClamp()) {
888 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
898 if (ST.hasVOP3PInsts()) {
900 .clampMaxNumElements(0,
S8, 2)
921 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
933 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
940 .clampScalar(0,
S16,
S64);
973 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
974 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
981 if (ST.has16BitInsts()) {
982 if (ST.hasVOP3PInsts())
985 FPOpActions.legalFor({
S16});
987 TrigActions.customFor({
S16});
988 FDIVActions.customFor({
S16});
991 if (ST.hasPackedFP32Ops()) {
992 FPOpActions.legalFor({
V2S32});
993 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
996 if (ST.hasPackedFP64Ops()) {
997 FPOpActions.legalFor({
V2S64});
998 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
1001 if (ST.hasPackedFP64Ops()) {
1002 FPOpActions.legalFor({
V2S64});
1003 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
1006 auto &MinNumMaxNumIeee =
1009 if (ST.hasVOP3PInsts()) {
1010 MinNumMaxNumIeee.legalFor(FPTypesPK16)
1012 .clampMaxNumElements(0,
S16, 2)
1013 .clampScalar(0,
S16,
S64)
1015 }
else if (ST.has16BitInsts()) {
1016 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
1018 MinNumMaxNumIeee.legalFor(FPTypesBase)
1019 .clampScalar(0,
S32,
S64)
1024 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1026 if (ST.hasPackedFP64Ops()) {
1027 MinNumMaxNum.customFor(FPTypesPK16_64)
1029 .clampMaxNumElements(0,
S16, 2)
1030 .clampMaxNumElements(0,
S64, 2)
1031 .clampScalar(0,
S16,
S64)
1033 }
else if (ST.hasVOP3PInsts()) {
1034 MinNumMaxNum.customFor(FPTypesPK16)
1036 .clampMaxNumElements(0,
S16, 2)
1037 .clampScalar(0,
S16,
S64)
1039 }
else if (ST.has16BitInsts()) {
1040 MinNumMaxNum.customFor(FPTypes16)
1041 .clampScalar(0,
S16,
S64)
1044 MinNumMaxNum.customFor(FPTypesBase)
1045 .clampScalar(0,
S32,
S64)
1049 if (ST.hasVOP3PInsts())
1066 .
legalFor(ST.hasPackedFP32Ops(), {V2S32})
1068 if (ST.hasPackedFP32Ops())
1072 if (ST.has16BitInsts()) {
1106 if (ST.hasFractBug()) {
1140 if (ST.hasCvtPkF16F32Inst()) {
1142 .clampMaxNumElements(0,
S16, 2);
1146 FPTruncActions.scalarize(0).lower();
1154 if (ST.has16BitInsts()) {
1168 if (ST.hasPackedFP32Ops())
1178 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1179 FMad.customFor({
S32,
S16});
1180 else if (ST.hasMadMacF32Insts())
1181 FMad.customFor({
S32});
1182 else if (ST.hasMadF16())
1183 FMad.customFor({
S16});
1188 if (ST.has16BitInsts()) {
1191 FRem.minScalar(0,
S32)
1200 .clampMaxNumElements(0,
S16, 2)
1219 if (ST.has16BitInsts())
1230 if (ST.has16BitInsts())
1243 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1247 if (
ST.has16BitInsts())
1257 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1258 .clampScalar(0,
S16,
S64)
1262 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1268 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1272 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1273 .clampScalar(0,
S16,
S64)
1277 if (
ST.has16BitInsts()) {
1278 getActionDefinitionsBuilder(
1279 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1281 .clampScalar(0,
S16,
S64)
1284 getActionDefinitionsBuilder(
1285 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1287 .clampScalar(0,
S32,
S64)
1290 getActionDefinitionsBuilder(
1291 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1294 .clampScalar(0,
S32,
S64)
1298 getActionDefinitionsBuilder(G_PTR_ADD)
1304 getActionDefinitionsBuilder(G_PTRMASK)
1306 .scalarSameSizeAs(1, 0)
1310 getActionDefinitionsBuilder(G_ICMP)
1322 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1323 .legalForCartesianProduct(
1324 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1325 if (
ST.has16BitInsts()) {
1326 CmpBuilder.legalFor({{
S1,
S16}});
1337 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1339 if (
ST.hasSALUFloatInsts())
1348 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1349 if (
ST.has16BitInsts())
1350 ExpOps.customFor({{
S32}, {
S16}});
1352 ExpOps.customFor({
S32});
1353 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1356 getActionDefinitionsBuilder(G_FPOWI)
1357 .clampScalar(0, MinScalarFPTy,
S32)
1360 getActionDefinitionsBuilder(G_FLOG2)
1361 .legalFor(
ST.has16BitInsts(), {S16})
1366 getActionDefinitionsBuilder(G_FEXP2)
1367 .legalFor(
ST.has16BitInsts(), {S16})
1373 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1375 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1379 getActionDefinitionsBuilder(G_CTPOP)
1381 .clampScalar(0,
S32,
S32)
1382 .widenScalarToNextPow2(1, 32)
1383 .clampScalar(1,
S32,
S64)
1385 .widenScalarToNextPow2(0, 32);
1388 if (
ST.has16BitInsts())
1389 getActionDefinitionsBuilder(G_IS_FPCLASS)
1390 .legalForCartesianProduct({
S1}, FPTypes16)
1391 .widenScalarToNextPow2(1)
1395 getActionDefinitionsBuilder(G_IS_FPCLASS)
1396 .legalForCartesianProduct({
S1}, FPTypesBase)
1397 .lowerFor({
S1,
S16})
1398 .widenScalarToNextPow2(1)
1405 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1407 .clampScalar(0,
S32,
S32)
1408 .clampScalar(1,
S32,
S64)
1409 .widenScalarToNextPow2(0, 32)
1410 .widenScalarToNextPow2(1, 32)
1414 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1417 .clampScalar(0,
S32,
S32)
1418 .clampScalar(1,
S32,
S64)
1420 .widenScalarToNextPow2(0, 32)
1421 .widenScalarToNextPow2(1, 32);
1423 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1425 .clampScalar(0,
S32,
S32)
1426 .clampScalar(1,
S32,
S64)
1428 .widenScalarToNextPow2(0, 32)
1429 .widenScalarToNextPow2(1, 32);
1431 getActionDefinitionsBuilder(G_CTLS)
1434 .clampScalar(0,
S32,
S32)
1435 .clampScalar(1,
S32,
S32);
1439 getActionDefinitionsBuilder(G_BITREVERSE)
1441 .clampScalar(0,
S32,
S64)
1443 .widenScalarToNextPow2(0);
1445 if (
ST.has16BitInsts()) {
1446 getActionDefinitionsBuilder(G_BSWAP)
1448 .clampMaxNumElementsStrict(0,
S16, 2)
1451 .widenScalarToNextPow2(0)
1452 .clampScalar(0,
S16,
S32)
1455 if (
ST.hasVOP3PInsts()) {
1456 getActionDefinitionsBuilder(G_ABS)
1458 .clampMaxNumElements(0,
S16, 2)
1460 .widenScalarToNextPow2(0)
1463 if (
ST.hasMinMaxI64Insts()) {
1464 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1466 .clampMaxNumElements(0,
S16, 2)
1468 .widenScalarToNextPow2(0)
1472 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1474 .clampMaxNumElements(0,
S16, 2)
1476 .widenScalarToNextPow2(0)
1481 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1483 .widenScalarToNextPow2(0)
1490 getActionDefinitionsBuilder(G_BSWAP)
1495 .widenScalarToNextPow2(0)
1500 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1503 .widenScalarToNextPow2(0)
1508 getActionDefinitionsBuilder(G_INTTOPTR)
1510 .legalForCartesianProduct(AddrSpaces64, {
S64})
1511 .legalForCartesianProduct(AddrSpaces32, {
S32})
1524 getActionDefinitionsBuilder(G_PTRTOINT)
1526 .legalForCartesianProduct(AddrSpaces64, {
S64})
1527 .legalForCartesianProduct(AddrSpaces32, {
S32})
1540 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1544 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1545 bool IsLoad) ->
bool {
1549 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1563 unsigned NumRegs = (MemSize + 31) / 32;
1565 if (!
ST.hasDwordx3LoadStores())
1576 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1577 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1578 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1584 for (
unsigned Op : {G_LOAD, G_STORE}) {
1585 const bool IsStore =
Op == G_STORE;
1587 auto &Actions = getActionDefinitionsBuilder(
Op);
1590 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1593 {
S64, GlobalPtr,
S64, GlobalAlign32},
1596 {
S32, GlobalPtr,
S8, GlobalAlign8},
1597 {
S32, GlobalPtr,
S16, GlobalAlign16},
1599 {
S32, LocalPtr,
S32, 32},
1600 {
S64, LocalPtr,
S64, 32},
1602 {
S32, LocalPtr,
S8, 8},
1603 {
S32, LocalPtr,
S16, 16},
1606 {
S32, PrivatePtr,
S32, 32},
1607 {
S32, PrivatePtr,
S8, 8},
1608 {
S32, PrivatePtr,
S16, 16},
1611 {
S32, ConstantPtr,
S32, GlobalAlign32},
1614 {
S64, ConstantPtr,
S64, GlobalAlign32},
1615 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1617 Actions.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1618 {{S16, GlobalPtr, S8, GlobalAlign8},
1619 {S16, GlobalPtr, S16, GlobalAlign16},
1620 {S16, LocalPtr, S8, 8},
1621 {S16, LocalPtr, S16, 16},
1622 {S16, PrivatePtr, S8, 8},
1623 {S16, PrivatePtr, S16, 16}});
1633 Actions.unsupportedIf(
1634 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1648 Actions.customIf(
typeIs(1, Constant32Ptr));
1674 return !Query.
Types[0].isVector() &&
1675 needToSplitMemOp(Query,
Op == G_LOAD);
1677 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1682 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1685 if (DstSize > MemSize)
1691 if (MemSize > MaxSize)
1699 return Query.
Types[0].isVector() &&
1700 needToSplitMemOp(Query,
Op == G_LOAD);
1702 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1716 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1717 if (MemSize > MaxSize) {
1721 if (MaxSize % EltSize == 0) {
1727 unsigned NumPieces = MemSize / MaxSize;
1731 if (NumPieces == 1 || NumPieces >= NumElts ||
1732 NumElts % NumPieces != 0)
1733 return std::pair(0, EltTy);
1741 return std::pair(0, EltTy);
1756 return std::pair(0, EltTy);
1761 .widenScalarToNextPow2(0)
1768 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1769 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1770 {
S32, GlobalPtr,
S16, 2 * 8},
1771 {
S32, LocalPtr,
S8, 8},
1772 {
S32, LocalPtr,
S16, 16},
1773 {
S32, PrivatePtr,
S8, 8},
1774 {
S32, PrivatePtr,
S16, 16},
1775 {
S32, ConstantPtr,
S8, 8},
1776 {
S32, ConstantPtr,
S16, 2 * 8}})
1777 .legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1778 {{S16, GlobalPtr, S8, GlobalAlign8},
1779 {S16, LocalPtr, S8, GlobalAlign8},
1780 {S16, PrivatePtr, S8, GlobalAlign8},
1781 {S16, ConstantPtr, S8, GlobalAlign8}})
1786 if (
ST.hasFlatAddressSpace()) {
1787 ExtLoads.legalForTypesWithMemDesc(
1788 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1790 ExtLoads.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1791 {{S16, FlatPtr, S8, GlobalAlign8}});
1799 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1801 ExtLoads.narrowScalarIf(
1808 ExtLoads.clampScalar(0,
S32,
S32)
1809 .widenScalarToNextPow2(0)
1812 auto &Atomics = getActionDefinitionsBuilder(
1813 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1814 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1815 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1816 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1817 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1818 {
S64, GlobalPtr}, {
S64, LocalPtr},
1819 {
S32, RegionPtr}, {
S64, RegionPtr}});
1820 if (
ST.hasFlatAddressSpace()) {
1821 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1825 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1826 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1827 if (
ST.hasFlatAddressSpace()) {
1828 Atomics32.legalFor({{
S32, FlatPtr}});
1832 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1833 if (
ST.hasLDSFPAtomicAddF32()) {
1834 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1835 if (
ST.hasLdsAtomicAddF64())
1836 Atomic.legalFor({{
S64, LocalPtr}});
1837 if (
ST.hasAtomicDsPkAdd16Insts())
1838 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1840 if (
ST.hasAtomicFaddInsts())
1841 Atomic.legalFor({{
S32, GlobalPtr}});
1842 if (
ST.hasFlatAtomicFaddF32Inst())
1843 Atomic.legalFor({{
S32, FlatPtr}});
1845 if (
ST.hasGFX90AInsts() ||
ST.hasGFX1250Insts()) {
1856 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1857 ST.hasAtomicBufferGlobalPkAddF16Insts())
1858 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1859 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1860 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1861 if (
ST.hasAtomicFlatPkAdd16Insts())
1862 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1867 auto &AtomicFMinFMax =
1868 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1869 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1871 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1872 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1873 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1874 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1875 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1876 AtomicFMinFMax.legalFor({
F32, FlatPtr});
1877 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1878 AtomicFMinFMax.legalFor({
F64, FlatPtr});
1882 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1883 .customFor({{
S32, GlobalPtr}, {
S64, GlobalPtr},
1884 {
S32, FlatPtr}, {
S64, FlatPtr}})
1885 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1886 {
S32, RegionPtr}, {
S64, RegionPtr}});
1890 getActionDefinitionsBuilder(G_SELECT)
1892 LocalPtr, FlatPtr, PrivatePtr,
1896 .clampScalar(0,
S16,
S64)
1900 .clampMaxNumElements(0,
S32, 2)
1901 .clampMaxNumElements(0, LocalPtr, 2)
1902 .clampMaxNumElements(0, PrivatePtr, 2)
1904 .widenScalarToNextPow2(0)
1909 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1911 if (
ST.has16BitInsts()) {
1912 if (
ST.hasVOP3PInsts()) {
1914 .clampMaxNumElements(0,
S16, 2);
1916 Shifts.legalFor({{
S16,
S16}});
1919 Shifts.widenScalarIf(
1924 const LLT AmountTy = Query.
Types[1];
1929 Shifts.clampScalar(1,
S32,
S32);
1930 Shifts.widenScalarToNextPow2(0, 16);
1931 Shifts.clampScalar(0,
S16,
S64);
1933 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1941 Shifts.clampScalar(1,
S32,
S32);
1942 Shifts.widenScalarToNextPow2(0, 32);
1943 Shifts.clampScalar(0,
S32,
S64);
1945 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1950 Shifts.scalarize(0);
1952 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1953 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1954 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1955 unsigned IdxTypeIdx = 2;
1957 getActionDefinitionsBuilder(
Op)
1959 const LLT EltTy = Query.
Types[EltTypeIdx];
1960 const LLT VecTy = Query.
Types[VecTypeIdx];
1961 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1963 const bool isLegalVecType =
1973 return (EltSize == 32 || EltSize == 64) &&
1989 const LLT EltTy = Query.
Types[EltTypeIdx];
1990 const LLT VecTy = Query.
Types[VecTypeIdx];
1994 const unsigned TargetEltSize =
1995 DstEltSize % 64 == 0 ? 64 : 32;
1996 return std::pair(VecTypeIdx,
2000 .clampScalar(EltTypeIdx,
S32,
S64)
2001 .clampScalar(VecTypeIdx,
S32,
S64)
2002 .clampScalar(IdxTypeIdx,
S32,
S32)
2003 .clampMaxNumElements(VecTypeIdx,
S32, 32)
2012 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
2014 const LLT &EltTy = Query.
Types[1].getElementType();
2015 return Query.
Types[0] != EltTy;
2018 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
2019 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
2020 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
2021 getActionDefinitionsBuilder(
Op)
2024 const LLT BigTy = Query.
Types[BigTyIdx];
2030 const LLT LitTy = Query.
Types[LitTyIdx];
2035 .widenScalarToNextPow2(BigTyIdx, 32)
2043 const LLT BigTy = Query.
Types[BigTyIdx];
2044 const LLT LitTy = Query.
Types[LitTyIdx];
2052 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2061 if (
ST.hasScalarPackInsts()) {
2064 .minScalarOrElt(0,
S16)
2067 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2071 BuildVector.customFor({
V2S16,
S16});
2072 BuildVector.minScalarOrElt(0,
S32);
2074 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2082 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2084 .clampMaxNumElements(0,
S32, 32)
2085 .clampMaxNumElements(1,
S16, 2)
2086 .clampMaxNumElements(0,
S16, 64);
2088 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2091 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2092 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2093 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2095 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2096 const LLT Ty = Query.
Types[TypeIdx];
2108 getActionDefinitionsBuilder(
Op)
2112 const LLT BigTy = Query.
Types[BigTyIdx];
2118 .widenScalarToNextPow2(LitTyIdx, 16)
2127 .clampScalar(LitTyIdx,
S32,
S512)
2128 .widenScalarToNextPow2(LitTyIdx, 32)
2132 return notValidElt(Query, LitTyIdx);
2137 return notValidElt(Query, BigTyIdx);
2142 if (
Op == G_MERGE_VALUES) {
2143 Builder.widenScalarIf(
2146 const LLT Ty = Query.
Types[LitTyIdx];
2152 Builder.widenScalarIf(
2154 const LLT Ty = Query.
Types[BigTyIdx];
2160 const LLT &Ty = Query.
Types[BigTyIdx];
2162 if (NewSizeInBits >= 256) {
2164 if (RoundedTo < NewSizeInBits)
2165 NewSizeInBits = RoundedTo;
2167 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2176 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2177 .legalFor({{
S32}, {
S64}})
2178 .clampScalar(0,
S32,
S64);
2180 if (
ST.hasVOP3PInsts()) {
2181 SextInReg.lowerFor({{
V2S16}})
2185 .clampMaxNumElementsStrict(0,
S16, 2);
2186 }
else if (
ST.has16BitInsts()) {
2187 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2191 SextInReg.lowerFor({{
S32}, {
S64}});
2196 .clampScalar(0,
S32,
S64)
2199 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2203 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2204 FSHRActionDefs.legalFor({{
S32,
S32}})
2205 .clampMaxNumElementsStrict(0,
S16, 2);
2206 if (
ST.hasVOP3PInsts())
2208 FSHRActionDefs.scalarize(0).lower();
2210 if (
ST.hasVOP3PInsts()) {
2211 getActionDefinitionsBuilder(G_FSHL)
2213 .clampMaxNumElementsStrict(0,
S16, 2)
2217 getActionDefinitionsBuilder(G_FSHL)
2222 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2225 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({
S64});
2227 getActionDefinitionsBuilder(G_FENCE)
2230 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2235 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2237 .clampScalar(1,
S32,
S32)
2238 .clampScalar(0,
S32,
S64)
2239 .widenScalarToNextPow2(0)
2242 getActionDefinitionsBuilder(
2246 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2247 G_READ_REGISTER, G_WRITE_REGISTER,
2252 if (
ST.hasIEEEMinimumMaximumInsts()) {
2253 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2254 .legalFor(FPTypesPK16)
2255 .clampMaxNumElements(0,
S16, 2)
2257 }
else if (
ST.hasVOP3PInsts()) {
2258 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2260 .clampMaxNumElementsStrict(0,
S16, 2)
2264 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2266 .clampScalar(0,
S32,
S64)
2270 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2273 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2275 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2276 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2277 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2280 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2282 getActionDefinitionsBuilder(
2283 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2284 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2285 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2286 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2291 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2292 G_INTRINSIC_CONVERGENT,
2293 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2296 getLegacyLegalizerInfo().computeTables();
2306 switch (
MI.getOpcode()) {
2307 case TargetOpcode::G_ADDRSPACE_CAST:
2309 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2311 case TargetOpcode::G_FCEIL:
2313 case TargetOpcode::G_FREM:
2315 case TargetOpcode::G_INTRINSIC_TRUNC:
2317 case TargetOpcode::G_SITOFP:
2319 case TargetOpcode::G_UITOFP:
2321 case TargetOpcode::G_FPTOSI:
2323 case TargetOpcode::G_FPTOUI:
2325 case TargetOpcode::G_FMINNUM:
2326 case TargetOpcode::G_FMAXNUM:
2327 case TargetOpcode::G_FMINIMUMNUM:
2328 case TargetOpcode::G_FMAXIMUMNUM:
2330 case TargetOpcode::G_EXTRACT:
2332 case TargetOpcode::G_INSERT:
2334 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2336 case TargetOpcode::G_INSERT_VECTOR_ELT:
2338 case TargetOpcode::G_FSIN:
2339 case TargetOpcode::G_FCOS:
2341 case TargetOpcode::G_GLOBAL_VALUE:
2343 case TargetOpcode::G_LOAD:
2344 case TargetOpcode::G_SEXTLOAD:
2345 case TargetOpcode::G_ZEXTLOAD:
2347 case TargetOpcode::G_STORE:
2349 case TargetOpcode::G_FMAD:
2351 case TargetOpcode::G_FDIV:
2353 case TargetOpcode::G_FFREXP:
2355 case TargetOpcode::G_FSQRT:
2357 case TargetOpcode::G_UDIV:
2358 case TargetOpcode::G_UREM:
2359 case TargetOpcode::G_UDIVREM:
2361 case TargetOpcode::G_SDIV:
2362 case TargetOpcode::G_SREM:
2363 case TargetOpcode::G_SDIVREM:
2365 case TargetOpcode::G_ATOMIC_CMPXCHG:
2367 case TargetOpcode::G_FLOG2:
2369 case TargetOpcode::G_FLOG:
2370 case TargetOpcode::G_FLOG10:
2372 case TargetOpcode::G_FEXP2:
2374 case TargetOpcode::G_FEXP:
2375 case TargetOpcode::G_FEXP10:
2377 case TargetOpcode::G_FPOW:
2379 case TargetOpcode::G_FFLOOR:
2381 case TargetOpcode::G_BUILD_VECTOR:
2382 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2384 case TargetOpcode::G_MUL:
2386 case TargetOpcode::G_CTLZ:
2387 case TargetOpcode::G_CTTZ:
2389 case TargetOpcode::G_CTLS:
2391 case TargetOpcode::G_CTLZ_ZERO_POISON:
2393 case TargetOpcode::G_STACKSAVE:
2395 case TargetOpcode::G_GET_FPENV:
2397 case TargetOpcode::G_SET_FPENV:
2399 case TargetOpcode::G_TRAP:
2401 case TargetOpcode::G_DEBUGTRAP:
2421 if (ST.hasApertureRegs()) {
2426 ? AMDGPU::SRC_SHARED_BASE
2427 : AMDGPU::SRC_PRIVATE_BASE;
2428 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2429 !ST.hasGloballyAddressableScratch()) &&
2430 "Cannot use src_private_base with globally addressable scratch!");
2433 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2434 return B.buildUnmerge(
S32, Dst).getReg(1);
2449 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2465 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2468 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2490 B.buildObjectPtrOffset(
2492 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2493 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2501 switch (Def->getOpcode()) {
2502 case AMDGPU::G_FRAME_INDEX:
2503 case AMDGPU::G_GLOBAL_VALUE:
2504 case AMDGPU::G_BLOCK_ADDR:
2506 case AMDGPU::G_CONSTANT: {
2507 const ConstantInt *CI = Def->getOperand(1).getCImm();
2524 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2526 Intrinsic::amdgcn_addrspacecast_nonnull));
2531 :
MI.getOperand(1).getReg();
2535 unsigned SrcAS = SrcTy.getAddressSpace();
2545 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2552 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2554 ST.hasGloballyAddressableScratch()) {
2558 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2560 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2561 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2563 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2565 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2569 return B.buildExtract(Dst, Src, 0).getReg(0);
2575 castFlatToLocalOrPrivate(Dst);
2576 MI.eraseFromParent();
2582 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2583 auto FlatNull =
B.buildConstant(SrcTy, 0);
2586 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2590 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2592 MI.eraseFromParent();
2599 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2602 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2605 ST.hasGloballyAddressableScratch()) {
2610 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2614 if (ST.isWave64()) {
2615 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2621 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2622 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2624 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2628 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2629 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2631 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2632 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2641 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2647 castLocalOrPrivateToFlat(Dst);
2648 MI.eraseFromParent();
2652 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2659 SegmentNull.getReg(0));
2661 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2663 MI.eraseFromParent();
2668 SrcTy.getSizeInBits() == 64) {
2670 B.buildExtract(Dst, Src, 0);
2671 MI.eraseFromParent();
2678 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2679 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2680 if (AddrHiVal == 0) {
2682 B.buildIntToPtr(Dst, Zext);
2684 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2685 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2688 MI.eraseFromParent();
2695 MI.eraseFromParent();
2704 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2709 auto C1 =
B.buildFConstant(Ty, C1Val);
2710 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2713 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2714 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2716 auto C2 =
B.buildFConstant(Ty, C2Val);
2717 auto Fabs =
B.buildFAbs(Ty, Src);
2720 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2721 MI.eraseFromParent();
2739 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2741 const auto Zero =
B.buildFConstant(
S64, 0.0);
2742 const auto One =
B.buildFConstant(
S64, 1.0);
2745 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2746 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2749 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2750 MI.eraseFromParent();
2758 Register Src0Reg =
MI.getOperand(1).getReg();
2759 Register Src1Reg =
MI.getOperand(2).getReg();
2760 auto Flags =
MI.getFlags();
2763 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2764 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2765 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2766 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2767 MI.eraseFromParent();
2773 const unsigned FractBits = 52;
2774 const unsigned ExpBits = 11;
2777 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2778 auto Const1 =
B.buildConstant(
S32, ExpBits);
2780 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2782 .addUse(Const0.getReg(0))
2783 .addUse(Const1.getReg(0));
2785 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2799 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2806 const unsigned FractBits = 52;
2809 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2810 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2812 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2814 const auto Zero32 =
B.buildConstant(
S32, 0);
2817 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2819 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2820 auto Not =
B.buildNot(
S64, Shr);
2821 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2822 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2827 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2828 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2829 MI.eraseFromParent();
2845 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2846 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2849 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2850 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2852 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2853 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2856 B.buildFAdd(Dst, LdExp, CvtLo);
2857 MI.eraseFromParent();
2863 auto One =
B.buildConstant(
S32, 1);
2867 auto ThirtyOne =
B.buildConstant(
S32, 31);
2868 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2869 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2870 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2871 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2872 .addUse(Unmerge.getReg(1));
2873 auto LS2 =
B.buildSub(
S32, LS, One);
2874 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2876 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2877 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2878 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2879 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2880 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2881 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2882 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2883 B.buildFLdexp(Dst, FVal, Scale);
2884 MI.eraseFromParent();
2904 unsigned Flags =
MI.getFlags();
2915 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2923 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2924 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2928 K0 =
B.buildFConstant(
2930 K1 =
B.buildFConstant(
2933 K0 =
B.buildFConstant(
2935 K1 =
B.buildFConstant(
2939 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2940 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2941 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2944 :
B.buildFPTOUI(
S32, FloorMul);
2945 auto Lo =
B.buildFPTOUI(
S32, Fma);
2949 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2951 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2954 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2955 MI.eraseFromParent();
2987 unsigned StartIdx =
Offset / 32;
2989 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
2991 if (DstCount == 1) {
2993 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2998 for (
unsigned I = 0;
I < DstCount; ++
I)
2999 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
3000 B.buildMergeLikeInstr(DstReg, MergeVec);
3003 MI.eraseFromParent();
3013 Register InsertSrc =
MI.getOperand(2).getReg();
3022 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3026 unsigned DstCount = DstSize / 32;
3027 unsigned InsertCount = InsertSize / 32;
3028 unsigned StartIdx =
Offset / 32;
3030 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
3033 for (
unsigned I = 0;
I < StartIdx; ++
I)
3036 if (InsertCount == 1) {
3040 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
3043 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
3044 for (
unsigned I = 0;
I < InsertCount; ++
I)
3048 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
3051 B.buildMergeLikeInstr(DstReg, MergeVec);
3053 MI.eraseFromParent();
3080 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3081 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3082 B.buildIntToPtr(Dst, IntElt);
3084 MI.eraseFromParent();
3091 std::optional<ValueAndVReg> MaybeIdxVal =
3095 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3098 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3099 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3104 MI.eraseFromParent();
3133 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3134 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3135 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3137 B.buildIntToPtr(Dst, IntVecDest);
3138 MI.eraseFromParent();
3145 std::optional<ValueAndVReg> MaybeIdxVal =
3150 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3153 if (IdxVal < NumElts) {
3155 for (
unsigned i = 0; i < NumElts; ++i)
3157 B.buildUnmerge(SrcRegs, Vec);
3159 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3160 B.buildMergeLikeInstr(Dst, SrcRegs);
3165 MI.eraseFromParent();
3176 unsigned Flags =
MI.getFlags();
3180 if (ST.hasTrigReducedRange()) {
3181 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3182 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3183 .addUse(MulVal.getReg(0))
3187 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3190 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3194 MI.eraseFromParent();
3202 unsigned GAFlags)
const {
3231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3233 if (ST.has64BitLiterals()) {
3237 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3241 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3250 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3251 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3254 B.buildExtract(DstReg, PCReg, 0);
3264 if (RequiresHighHalf && ST.has64BitLiterals()) {
3266 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3267 B.buildInstr(AMDGPU::S_MOV_B64)
3282 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3285 B.buildInstr(AMDGPU::S_MOV_B32)
3290 if (RequiresHighHalf) {
3292 "Must provide a 64-bit pointer type!");
3295 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3297 B.buildInstr(AMDGPU::S_MOV_B32)
3308 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3310 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3314 if (AddrDst != DstReg)
3315 B.buildCast(DstReg, AddrDst);
3316 }
else if (AddrLo != DstReg) {
3319 B.buildCast(DstReg, AddrLo);
3328 unsigned AS = Ty.getAddressSpace();
3336 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3340 Fn,
"local memory global used by non-kernel function",
3349 B.buildUndef(DstReg);
3350 MI.eraseFromParent();
3374 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3375 B.buildIntToPtr(DstReg, Sz);
3376 MI.eraseFromParent();
3382 MI.eraseFromParent();
3386 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3388 MI.eraseFromParent();
3396 MI.eraseFromParent();
3402 MI.eraseFromParent();
3418 if (Ty.getSizeInBits() == 32) {
3420 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3421 B.buildExtract(DstReg, Load, 0);
3423 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3425 MI.eraseFromParent();
3448 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3450 MI.getOperand(1).setReg(Cast.getReg(0));
3455 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3481 if (WideMemSize == ValSize) {
3487 MI.setMemRefs(MF, {WideMMO});
3493 if (ValSize > WideMemSize)
3500 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3501 B.buildTrunc(ValReg, WideLoad).getReg(0);
3508 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3509 B.buildExtract(ValReg, WideLoad, 0);
3513 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3514 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3518 MI.eraseFromParent();
3531 Register DataReg =
MI.getOperand(0).getReg();
3576 "this should not have been custom lowered");
3581 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3583 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3587 .setMemRefs(
MI.memoperands());
3589 MI.eraseFromParent();
3597 switch (
DefMI->getOpcode()) {
3598 case TargetOpcode::G_INTRINSIC: {
3600 case Intrinsic::amdgcn_frexp_mant:
3601 case Intrinsic::amdgcn_log:
3602 case Intrinsic::amdgcn_log_clamp:
3603 case Intrinsic::amdgcn_exp2:
3604 case Intrinsic::amdgcn_sqrt:
3612 case TargetOpcode::G_FSQRT:
3614 case TargetOpcode::G_FFREXP: {
3615 if (
DefMI->getOperand(0).getReg() == Src)
3619 case TargetOpcode::G_FPEXT: {
3640std::pair<Register, Register>
3642 unsigned Flags)
const {
3647 auto SmallestNormal =
B.buildFConstant(
3649 auto IsLtSmallestNormal =
3652 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3653 auto One =
B.buildFConstant(
F32, 1.0);
3655 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3656 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3658 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3671 LLT Ty =
B.getMRI()->getType(Dst);
3672 unsigned Flags =
MI.getFlags();
3677 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3678 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3679 .addUse(Ext.getReg(0))
3681 B.buildFPTrunc(Dst,
Log2, Flags);
3682 MI.eraseFromParent();
3690 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3693 MI.eraseFromParent();
3697 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3698 .addUse(ScaledInput)
3701 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3702 auto Zero =
B.buildFConstant(Ty, 0.0);
3704 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3705 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3707 MI.eraseFromParent();
3713 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3714 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3719 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3720 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3725 unsigned Flags =
MI.getFlags();
3738 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3740 B.buildFPTrunc(Dst, LogVal);
3745 MI.eraseFromParent();
3754 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3757 if (ST.hasFastFMAF32()) {
3759 const float c_log10 = 0x1.344134p-2f;
3760 const float cc_log10 = 0x1.09f79ep-26f;
3763 const float c_log = 0x1.62e42ep-1f;
3764 const float cc_log = 0x1.efa39ep-25f;
3766 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3767 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3771 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3772 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3773 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3774 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3775 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3778 const float ch_log10 = 0x1.344000p-2f;
3779 const float ct_log10 = 0x1.3509f6p-18f;
3782 const float ch_log = 0x1.62e000p-1f;
3783 const float ct_log = 0x1.0bfbe8p-15f;
3785 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3786 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3788 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3789 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3790 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3794 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3797 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3799 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3802 const bool IsFiniteOnly =
3805 if (!IsFiniteOnly) {
3808 auto Fabs =
B.buildFAbs(Ty,
Y);
3811 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3815 auto Zero =
B.buildFConstant(Ty, 0.0);
3817 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3818 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3819 B.buildFSub(Dst, R, Shift, Flags);
3821 B.buildCopy(Dst, R);
3824 MI.eraseFromParent();
3830 unsigned Flags)
const {
3831 const double Log2BaseInverted =
3834 LLT Ty =
B.getMRI()->getType(Dst);
3839 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3842 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3843 auto Zero =
B.buildFConstant(Ty, 0.0);
3845 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3846 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3848 if (ST.hasFastFMAF32())
3849 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3851 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3852 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3860 ?
B.buildFLog2(Ty, Src, Flags)
3861 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3864 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3865 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3876 unsigned Flags =
MI.getFlags();
3877 LLT Ty =
B.getMRI()->getType(Dst);
3887 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3888 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3889 .addUse(Ext.getReg(0))
3891 B.buildFPTrunc(Dst,
Log2, Flags);
3892 MI.eraseFromParent();
3902 MI.eraseFromParent();
3910 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3912 RangeCheckConst, Flags);
3914 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3915 auto Zero =
B.buildFConstant(Ty, 0.0);
3916 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3917 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3919 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3920 .addUse(AddInput.getReg(0))
3923 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3924 auto One =
B.buildFConstant(Ty, 1.0);
3925 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3926 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3927 MI.eraseFromParent();
3932 const SrcOp &Src,
unsigned Flags) {
3933 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3936 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3937 .addUse(Src.getReg())
3940 return B.buildFExp2(Dst, Src, Flags);
3946 bool IsExp10)
const {
3947 LLT Ty =
B.getMRI()->getType(
X);
3951 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3952 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3959 LLT Ty =
B.getMRI()->getType(Dst);
3966 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3969 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3970 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3971 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3974 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3976 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3977 .addUse(ExpInput.getReg(0))
3980 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3981 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3982 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3988 unsigned Flags)
const {
3989 LLT Ty =
B.getMRI()->getType(Dst);
3994 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3995 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3997 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
3998 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3999 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
4000 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4001 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
4011 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
4015 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
4016 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
4017 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
4019 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
4020 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
4022 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
4023 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
4024 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
4025 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4027 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
4028 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
4029 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4031 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4050 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
4052 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
4054 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
4056 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4057 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4058 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
4059 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
4061 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
4062 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
4063 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4064 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4066 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4067 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
4068 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4069 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4070 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4072 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4073 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4074 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4075 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4078 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4079 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4080 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4082 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4083 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4084 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4085 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4086 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4090 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4091 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4093 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4095 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4097 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4099 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4101 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4102 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4103 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4104 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4106 auto One =
B.buildFConstant(
S64, 1.0);
4107 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4108 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4111 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4112 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4119 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4126 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4128 MI.eraseFromParent();
4136 const unsigned Flags =
MI.getFlags();
4148 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4156 MI.eraseFromParent();
4167 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4170 B.buildFPTrunc(Dst, Lowered, Flags);
4171 MI.eraseFromParent();
4182 MI.eraseFromParent();
4210 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4213 if (ST.hasFastFMAF32()) {
4215 const float cc_exp = 0x1.4ae0bep-26f;
4216 const float c_exp10 = 0x1.a934f0p+1f;
4217 const float cc_exp10 = 0x1.2f346ep-24f;
4219 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4220 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4221 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4222 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4224 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4225 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4227 const float ch_exp = 0x1.714000p+0f;
4228 const float cl_exp = 0x1.47652ap-12f;
4230 const float ch_exp10 = 0x1.a92000p+1f;
4231 const float cl_exp10 = 0x1.4f0978p-11f;
4233 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4234 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4235 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4237 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4238 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4240 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4241 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4244 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4245 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4248 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4251 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4252 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4255 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4256 .addUse(
A.getReg(0))
4258 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4260 auto UnderflowCheckConst =
4261 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4262 auto Zero =
B.buildFConstant(Ty, 0.0);
4266 R =
B.buildSelect(Ty, Underflow, Zero, R);
4269 auto OverflowCheckConst =
4270 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4275 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4278 B.buildCopy(Dst, R);
4279 MI.eraseFromParent();
4288 unsigned Flags =
MI.getFlags();
4289 LLT Ty =
B.getMRI()->getType(Dst);
4294 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4295 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4296 .addUse(Log.getReg(0))
4299 B.buildFExp2(Dst,
Mul, Flags);
4300 }
else if (Ty == F16) {
4302 auto Log =
B.buildFLog2(F16, Src0, Flags);
4303 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4304 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4305 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4306 .addUse(Ext0.getReg(0))
4307 .addUse(Ext1.getReg(0))
4309 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4313 MI.eraseFromParent();
4321 ModSrc = SrcFNeg->getOperand(1).getReg();
4323 ModSrc = SrcFAbs->getOperand(1).getReg();
4325 ModSrc = SrcFAbs->getOperand(1).getReg();
4336 Register OrigSrc =
MI.getOperand(1).getReg();
4337 unsigned Flags =
MI.getFlags();
4339 "this should not have been custom lowered");
4349 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4369 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4371 B.buildFMinNum(Min, Fract, Const, Flags);
4376 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4379 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4380 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4382 MI.eraseFromParent();
4398 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4400 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4401 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4404 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4405 B.buildBitcast(Dst,
Merge);
4407 MI.eraseFromParent();
4424 bool UsePartialMad64_32,
4425 bool SeparateOddAlignedProducts)
const {
4440 auto getZero32 = [&]() ->
Register {
4442 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4445 auto getZero64 = [&]() ->
Register {
4447 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4452 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4463 if (CarryIn.empty())
4466 bool HaveCarryOut =
true;
4468 if (CarryIn.size() == 1) {
4470 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4474 CarryAccum = getZero32();
4476 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4477 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4479 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4484 LocalAccum = getZero32();
4485 HaveCarryOut =
false;
4490 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4491 LocalAccum =
Add.getReg(0);
4505 auto buildMadChain =
4508 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4509 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4516 if (LocalAccum.size() == 1 &&
4517 (!UsePartialMad64_32 || !CarryIn.empty())) {
4520 unsigned j1 = DstIndex - j0;
4521 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4525 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4527 LocalAccum[0] =
Mul.getReg(0);
4529 if (CarryIn.empty()) {
4530 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4533 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4539 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4543 if (j0 <= DstIndex) {
4544 bool HaveSmallAccum =
false;
4547 if (LocalAccum[0]) {
4548 if (LocalAccum.size() == 1) {
4549 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4550 HaveSmallAccum =
true;
4551 }
else if (LocalAccum[1]) {
4552 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4553 HaveSmallAccum =
false;
4555 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4556 HaveSmallAccum =
true;
4559 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4561 HaveSmallAccum =
true;
4565 unsigned j1 = DstIndex - j0;
4566 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4570 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4571 {Src0[j0], Src1[j1], Tmp});
4572 Tmp = Mad.getReg(0);
4573 if (!HaveSmallAccum)
4574 CarryOut.push_back(Mad.getReg(1));
4575 HaveSmallAccum =
false;
4578 }
while (j0 <= DstIndex);
4580 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4581 LocalAccum[0] = Unmerge.getReg(0);
4582 if (LocalAccum.size() > 1)
4583 LocalAccum[1] = Unmerge.getReg(1);
4610 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4611 Carry OddCarryIn = std::move(OddCarry);
4612 Carry EvenCarryIn = std::move(EvenCarry);
4617 if (2 * i < Accum.
size()) {
4618 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4619 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4624 if (!SeparateOddAlignedProducts) {
4625 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4626 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4628 bool IsHighest = 2 * i >= Accum.
size();
4631 .take_front(IsHighest ? 1 : 2);
4632 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4638 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4640 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4642 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4645 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4648 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4649 Lo->getOperand(1).getReg());
4650 Accum[2 * i] =
Hi.getReg(0);
4651 SeparateOddCarry =
Hi.getReg(1);
4658 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4659 EvenCarryIn.push_back(CarryOut);
4661 if (2 * i < Accum.
size()) {
4662 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4663 OddCarry.push_back(CarryOut);
4675 assert(ST.hasMad64_32());
4676 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4688 unsigned Size = Ty.getSizeInBits();
4689 if (ST.hasVMulU64Inst() &&
Size == 64)
4692 unsigned NumParts =
Size / 32;
4704 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4708 for (
unsigned i = 0; i < NumParts; ++i) {
4712 B.buildUnmerge(Src0Parts, Src0);
4713 B.buildUnmerge(Src1Parts, Src1);
4716 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4717 SeparateOddAlignedProducts);
4719 B.buildMergeLikeInstr(DstReg, AccumRegs);
4720 MI.eraseFromParent();
4735 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4736 ? AMDGPU::G_AMDGPU_FFBH_U32
4737 : AMDGPU::G_AMDGPU_FFBL_B32;
4738 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4741 MI.eraseFromParent();
4751 TypeSize NumBits = SrcTy.getSizeInBits();
4755 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4756 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4757 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4758 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4759 B.buildTrunc(Dst, Ctlz);
4760 MI.eraseFromParent();
4771 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4772 unsigned BitWidth = SrcTy.getSizeInBits();
4774 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4776 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4777 MI.eraseFromParent();
4783 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4786 return ConstVal == -1;
4793 Register CondDef =
MI.getOperand(0).getReg();
4812 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4821 UncondBrTarget = &*NextMBB;
4823 if (
Next->getOpcode() != AMDGPU::G_BR)
4842 *ArgRC,
B.getDebugLoc(), ArgTy);
4846 const unsigned Mask = Arg->
getMask();
4854 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4855 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4858 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4860 B.buildCopy(DstReg, LiveIn);
4870 if (!ST.hasClusters()) {
4873 MI.eraseFromParent();
4893 auto One =
B.buildConstant(
S32, 1);
4894 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4895 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4896 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4903 B.buildCopy(DstReg, GlobalIdXYZ);
4904 MI.eraseFromParent();
4908 B.buildCopy(DstReg, ClusterIdXYZ);
4909 MI.eraseFromParent();
4914 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4916 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4917 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4919 .addImm(ClusterIdField);
4920 auto Zero =
B.buildConstant(
S32, 0);
4923 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4924 MI.eraseFromParent();
4966 auto LoadConstant = [&](
unsigned N) {
4967 B.buildConstant(DstReg,
N);
4971 if (ST.hasArchitectedSGPRs() &&
4978 Arg = &WorkGroupIDX;
4979 ArgRC = &AMDGPU::SReg_32RegClass;
4983 Arg = &WorkGroupIDY;
4984 ArgRC = &AMDGPU::SReg_32RegClass;
4988 Arg = &WorkGroupIDZ;
4989 ArgRC = &AMDGPU::SReg_32RegClass;
4993 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
4994 return LoadConstant(0);
4995 Arg = &ClusterWorkGroupIDX;
4996 ArgRC = &AMDGPU::SReg_32RegClass;
5000 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
5001 return LoadConstant(0);
5002 Arg = &ClusterWorkGroupIDY;
5003 ArgRC = &AMDGPU::SReg_32RegClass;
5007 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
5008 return LoadConstant(0);
5009 Arg = &ClusterWorkGroupIDZ;
5010 ArgRC = &AMDGPU::SReg_32RegClass;
5015 return LoadConstant(ClusterDims.
getDims()[0] - 1);
5016 Arg = &ClusterWorkGroupMaxIDX;
5017 ArgRC = &AMDGPU::SReg_32RegClass;
5022 return LoadConstant(ClusterDims.
getDims()[1] - 1);
5023 Arg = &ClusterWorkGroupMaxIDY;
5024 ArgRC = &AMDGPU::SReg_32RegClass;
5029 return LoadConstant(ClusterDims.
getDims()[2] - 1);
5030 Arg = &ClusterWorkGroupMaxIDZ;
5031 ArgRC = &AMDGPU::SReg_32RegClass;
5035 Arg = &ClusterWorkGroupMaxFlatID;
5036 ArgRC = &AMDGPU::SReg_32RegClass;
5051 return LoadConstant(0);
5056 B.buildUndef(DstReg);
5060 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5072 MI.eraseFromParent();
5078 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5079 MI.eraseFromParent();
5086 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5100 B.buildUndef(DstReg);
5101 MI.eraseFromParent();
5105 if (Arg->isMasked()) {
5119 MI.eraseFromParent();
5134 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5143 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5151 Align Alignment)
const {
5155 "unexpected kernarg parameter type");
5162 MI.eraseFromParent();
5197 auto FloatY =
B.buildUITOFP(
S32,
Y);
5198 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5200 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5201 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5204 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5205 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5206 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5209 auto Q =
B.buildUMulH(
S32,
X, Z);
5210 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5213 auto One =
B.buildConstant(
S32, 1);
5216 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5222 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5225 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5244 auto Unmerge =
B.buildUnmerge(
S32, Val);
5246 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5247 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5249 auto Mad =
B.buildFMAD(
5253 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5254 auto Mul1 =
B.buildFMul(
5258 auto Mul2 =
B.buildFMul(
5260 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5263 auto Mad2 =
B.buildFMAD(
5267 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5268 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5270 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5285 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5287 auto Zero64 =
B.buildConstant(
S64, 0);
5288 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5290 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5291 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5293 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5294 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5295 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5297 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5298 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5299 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5301 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5302 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5303 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5304 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5305 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5307 auto Zero32 =
B.buildConstant(
S32, 0);
5308 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5309 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5310 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5312 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5313 Register NumerLo = UnmergeNumer.getReg(0);
5314 Register NumerHi = UnmergeNumer.getReg(1);
5316 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5317 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5318 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5319 Register Mul3_Lo = UnmergeMul3.getReg(0);
5320 Register Mul3_Hi = UnmergeMul3.getReg(1);
5321 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5322 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5323 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5324 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5326 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5327 Register DenomLo = UnmergeDenom.getReg(0);
5328 Register DenomHi = UnmergeDenom.getReg(1);
5331 auto C1 =
B.buildSExt(
S32, CmpHi);
5334 auto C2 =
B.buildSExt(
S32, CmpLo);
5337 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5344 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5345 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5346 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5347 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5349 auto One64 =
B.buildConstant(
S64, 1);
5350 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5356 auto C6 =
B.buildSelect(
5360 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5361 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5363 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5364 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5365 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5371 auto Sel1 =
B.buildSelect(
5378 auto Sel2 =
B.buildSelect(
5389 switch (
MI.getOpcode()) {
5392 case AMDGPU::G_UDIV: {
5393 DstDivReg =
MI.getOperand(0).getReg();
5396 case AMDGPU::G_UREM: {
5397 DstRemReg =
MI.getOperand(0).getReg();
5400 case AMDGPU::G_UDIVREM: {
5401 DstDivReg =
MI.getOperand(0).getReg();
5402 DstRemReg =
MI.getOperand(1).getReg();
5409 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5410 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5411 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5421 MI.eraseFromParent();
5432 if (Ty !=
S32 && Ty !=
S64)
5435 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5436 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5437 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5439 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5440 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5441 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5443 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5444 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5446 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5447 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5449 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5450 switch (
MI.getOpcode()) {
5453 case AMDGPU::G_SDIV: {
5454 DstDivReg =
MI.getOperand(0).getReg();
5458 case AMDGPU::G_SREM: {
5459 DstRemReg =
MI.getOperand(0).getReg();
5463 case AMDGPU::G_SDIVREM: {
5464 DstDivReg =
MI.getOperand(0).getReg();
5465 DstRemReg =
MI.getOperand(1).getReg();
5478 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5479 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5480 B.buildSub(DstDivReg, SignXor, Sign);
5484 auto Sign = LHSign.getReg(0);
5485 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5486 B.buildSub(DstRemReg, SignXor, Sign);
5489 MI.eraseFromParent();
5505 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5516 if (CLHS->isExactlyValue(1.0)) {
5517 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5521 MI.eraseFromParent();
5526 if (CLHS->isExactlyValue(-1.0)) {
5527 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5528 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5529 .addUse(FNeg.getReg(0))
5532 MI.eraseFromParent();
5539 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5544 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5547 B.buildFMul(Res, LHS, RCP, Flags);
5549 MI.eraseFromParent();
5564 if (!AllowInaccurateRcp)
5572 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5574 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5575 auto One =
B.buildFConstant(ResTy, 1.0);
5577 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5581 R =
B.buildFNeg(ResTy, R);
5583 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5584 R =
B.buildFMA(ResTy, Tmp0, R, R);
5586 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5587 R =
B.buildFMA(ResTy, Tmp1, R, R);
5591 B.buildCopy(Res, R);
5592 MI.eraseFromParent();
5596 auto Ret =
B.buildFMul(ResTy,
X, R);
5597 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5599 B.buildFMA(Res, Tmp2, R, Ret);
5600 MI.eraseFromParent();
5632 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5633 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5634 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5635 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5636 .addUse(RHSExt.getReg(0))
5638 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5640 if (ST.hasMadMacF32Insts()) {
5641 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5642 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5643 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5645 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5646 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5647 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5649 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5650 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5651 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5652 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5653 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5654 .addUse(RDst.getReg(0))
5659 MI.eraseFromParent();
5672 unsigned SPDenormMode =
5675 if (ST.hasDenormModeInst()) {
5677 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5679 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5680 B.buildInstr(AMDGPU::S_DENORM_MODE)
5681 .addImm(NewDenormModeValue);
5684 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5685 .addImm(SPDenormMode)
5707 auto One =
B.buildFConstant(
S32, 1.0f);
5709 auto DenominatorScaled =
5710 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5715 auto NumeratorScaled =
5716 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5722 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5723 .addUse(DenominatorScaled.getReg(0))
5725 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5728 const bool HasDynamicDenormals =
5733 if (!PreservesDenormals) {
5734 if (HasDynamicDenormals) {
5736 B.buildInstr(AMDGPU::S_GETREG_B32)
5737 .addDef(SavedSPDenormMode)
5743 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5744 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5745 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5746 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5747 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5748 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5750 if (!PreservesDenormals) {
5751 if (HasDynamicDenormals) {
5752 assert(SavedSPDenormMode);
5753 B.buildInstr(AMDGPU::S_SETREG_B32)
5754 .addReg(SavedSPDenormMode)
5760 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5761 .addUse(Fma4.getReg(0))
5762 .addUse(Fma1.getReg(0))
5763 .addUse(Fma3.getReg(0))
5764 .addUse(NumeratorScaled.getReg(1))
5767 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5768 .addUse(Fmas.getReg(0))
5773 MI.eraseFromParent();
5792 auto One =
B.buildFConstant(
S64, 1.0);
5794 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5800 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5802 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5803 .addUse(DivScale0.getReg(0))
5806 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5807 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5808 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5810 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5816 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5817 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5818 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5821 if (!ST.hasUsableDivScaleConditionOutput()) {
5827 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5828 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5829 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5830 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5833 Scale1Unmerge.getReg(1));
5835 Scale0Unmerge.getReg(1));
5836 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5838 Scale = DivScale1.getReg(1);
5841 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5842 .addUse(Fma4.getReg(0))
5843 .addUse(Fma3.getReg(0))
5844 .addUse(
Mul.getReg(0))
5848 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5849 .addUse(Fmas.getReg(0))
5854 MI.eraseFromParent();
5869 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5872 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5876 if (ST.hasFractBug()) {
5877 auto Fabs =
B.buildFAbs(Ty, Val);
5881 auto Zero =
B.buildConstant(InstrExpTy, 0);
5882 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5883 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5886 B.buildCopy(Res0, Mant);
5887 B.buildSExtOrTrunc(Res1, Exp);
5889 MI.eraseFromParent();
5904 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5907 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5908 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5909 auto C2 =
B.buildFConstant(
S32, 1.0f);
5912 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5914 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5916 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5917 .addUse(Mul0.getReg(0))
5920 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5922 B.buildFMul(Res, Sel, Mul1, Flags);
5924 MI.eraseFromParent();
5933 unsigned Flags =
MI.getFlags();
5934 assert(!ST.has16BitInsts());
5936 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5937 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5938 .addUse(Ext.getReg(0))
5940 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5941 MI.eraseFromParent();
5951 const unsigned Flags =
MI.getFlags();
5960 MI.eraseFromParent();
5964 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5966 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5967 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5968 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5973 .addUse(SqrtX.getReg(0))
5976 auto NegOne =
B.buildConstant(I32, -1);
5977 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5979 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5980 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5982 auto PosOne =
B.buildConstant(I32, 1);
5983 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5985 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5986 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5988 auto Zero =
B.buildFConstant(
F32, 0.0f);
5992 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5996 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5999 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
6000 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
6002 auto Half =
B.buildFConstant(
F32, 0.5f);
6003 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
6004 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
6005 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
6006 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
6007 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
6008 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
6009 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
6010 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
6013 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
6015 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
6017 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
6020 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
6022 MI.eraseFromParent();
6057 unsigned Flags =
MI.getFlags();
6062 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
6064 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
6068 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6069 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6070 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6073 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6075 auto Half =
B.buildFConstant(
F64, 0.5);
6076 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6077 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6079 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6080 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6082 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6083 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6085 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6086 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6088 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6090 Register SqrtRet = SqrtS2.getReg(0);
6092 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6093 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6094 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6097 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6098 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6099 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6104 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6113 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6115 MI.eraseFromParent();
6146 auto Flags =
MI.getFlags();
6158 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6168 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6169 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6174 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6176 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6177 MI.eraseFromParent();
6189 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6190 IID == Intrinsic::amdgcn_permlanex16;
6191 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6192 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6193 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6194 IID == Intrinsic::amdgcn_permlane_up ||
6195 IID == Intrinsic::amdgcn_permlane_down ||
6196 IID == Intrinsic::amdgcn_permlane_xor;
6200 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6202 case Intrinsic::amdgcn_readfirstlane:
6203 case Intrinsic::amdgcn_permlane64:
6204 return LaneOp.getReg(0);
6205 case Intrinsic::amdgcn_readlane:
6206 case Intrinsic::amdgcn_set_inactive:
6207 case Intrinsic::amdgcn_set_inactive_chain_arg:
6208 return LaneOp.addUse(Src1).getReg(0);
6209 case Intrinsic::amdgcn_writelane:
6210 case Intrinsic::amdgcn_permlane_bcast:
6211 case Intrinsic::amdgcn_permlane_up:
6212 case Intrinsic::amdgcn_permlane_down:
6213 case Intrinsic::amdgcn_permlane_xor:
6214 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6215 case Intrinsic::amdgcn_permlane16:
6216 case Intrinsic::amdgcn_permlanex16: {
6218 int64_t Src4 =
MI.getOperand(6).getImm();
6219 int64_t Src5 =
MI.getOperand(7).getImm();
6220 return LaneOp.addUse(Src1)
6227 case Intrinsic::amdgcn_mov_dpp8:
6228 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6229 case Intrinsic::amdgcn_update_dpp:
6230 return LaneOp.addUse(Src1)
6231 .addImm(
MI.getOperand(4).getImm())
6232 .addImm(
MI.getOperand(5).getImm())
6233 .addImm(
MI.getOperand(6).getImm())
6234 .addImm(
MI.getOperand(7).getImm())
6244 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6245 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6246 IsPermlaneShuffle) {
6247 Src1 =
MI.getOperand(3).getReg();
6248 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6249 IsPermlaneShuffle) {
6250 Src2 =
MI.getOperand(4).getReg();
6255 unsigned Size = Ty.getSizeInBits();
6257 unsigned SplitSize = 32;
6258 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6259 ST.hasDPALU_DPP() &&
6263 if (
Size == SplitSize) {
6269 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6271 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6274 if (IID == Intrinsic::amdgcn_writelane)
6277 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6278 B.buildTrunc(DstReg, LaneOpDst);
6279 MI.eraseFromParent();
6283 if (
Size % SplitSize != 0)
6287 bool NeedsBitcast =
false;
6288 if (Ty.isVector()) {
6291 if (EltSize == SplitSize) {
6292 PartialResTy = EltTy;
6293 }
else if (EltSize == 16 || EltSize == 32) {
6294 unsigned NElem = SplitSize / EltSize;
6298 NeedsBitcast =
true;
6303 unsigned NumParts =
Size / SplitSize;
6307 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6308 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6310 if (IID == Intrinsic::amdgcn_writelane)
6311 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6313 for (
unsigned i = 0; i < NumParts; ++i) {
6314 Src0 = Src0Parts.
getReg(i);
6316 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6317 Src1 = Src1Parts.
getReg(i);
6319 if (IID == Intrinsic::amdgcn_writelane)
6320 Src2 = Src2Parts.
getReg(i);
6322 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6326 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6329 B.buildMergeLikeInstr(DstReg, PartialRes);
6331 MI.eraseFromParent();
6339 ST.getTargetLowering()->getImplicitParameterOffset(
6349 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6350 B.buildConstant(IdxTy,
Offset).getReg(0));
6361 Register Pointer =
MI.getOperand(2).getReg();
6363 Register NumRecords =
MI.getOperand(4).getReg();
6369 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6371 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6373 if (ST.has45BitNumRecordsBufferResource()) {
6378 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6379 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6380 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6381 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6385 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6386 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6387 auto ExtShiftedStride =
6388 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6389 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6390 auto ExtShiftedFlags =
6391 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6392 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6394 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6395 B.buildMergeValues(Result, {LowHalf, HighHalf});
6397 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6398 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6399 auto LowHalf = Unmerge.getReg(0);
6400 auto HighHalf = Unmerge.getReg(1);
6402 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6403 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6404 auto ShiftConst =
B.buildConstant(
S32, 16);
6405 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6406 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6407 Register NewHighHalfReg = NewHighHalf.getReg(0);
6408 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6411 MI.eraseFromParent();
6428 MI.eraseFromParent();
6436 std::optional<uint32_t> KnownSize =
6438 if (KnownSize.has_value())
6439 B.buildConstant(DstReg, *KnownSize);
6457 MI.eraseFromParent();
6464 unsigned AddrSpace)
const {
6466 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6470 ST.hasGloballyAddressableScratch()) {
6472 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6473 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6475 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6477 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6479 B.buildConstant(
S32, 1u << 26));
6484 MI.eraseFromParent();
6494std::pair<Register, unsigned>
6506 bool CheckNUW = ST.hasGFX1250Insts();
6508 MRI, OrigOffset,
nullptr, CheckNUW);
6512 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6522 unsigned Overflow = ImmOffset & ~MaxImm;
6523 ImmOffset -= Overflow;
6524 if ((int32_t)Overflow < 0) {
6525 Overflow += ImmOffset;
6529 if (Overflow != 0) {
6531 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6533 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6534 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6539 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6541 return std::pair(BaseReg, ImmOffset);
6548 bool ImageStore)
const {
6554 if (ST.hasUnpackedD16VMem()) {
6555 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6558 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6559 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6567 if (ImageStore && ST.hasImageStoreD16Bug()) {
6570 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6572 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6579 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6580 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6582 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6590 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6591 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6593 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6610 bool IsFormat)
const {
6622 VData =
B.buildBitcast(Ty, VData).getReg(0);
6630 if (Ty.isVector()) {
6631 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6643 bool IsFormat)
const {
6650 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6665 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6668 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6672 VIndex =
MI.getOperand(3).getReg();
6675 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6678 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6679 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6683 Format =
MI.getOperand(5 + OpOffset).getImm();
6687 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6693 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6694 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6695 }
else if (IsFormat) {
6696 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6697 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6701 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6704 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6707 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6712 auto MIB =
B.buildInstr(
Opc)
6723 MIB.addImm(AuxiliaryData)
6724 .addImm(HasVIndex ? -1 : 0)
6725 .addMemOperand(MMO);
6727 MI.eraseFromParent();
6733 unsigned ImmOffset,
unsigned Format,
6736 auto MIB =
B.buildInstr(
Opc)
6747 MIB.addImm(AuxiliaryData)
6748 .addImm(HasVIndex ? -1 : 0)
6749 .addMemOperand(MMO);
6755 bool IsTyped)
const {
6769 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6770 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6772 StatusDst =
MI.getOperand(1).getReg();
6777 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6780 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6783 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6786 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6789 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6792 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6793 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6797 Format =
MI.getOperand(5 + OpOffset).getImm();
6801 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6811 Dst =
MI.getOperand(0).getReg();
6812 B.setInsertPt(
B.getMBB(),
MI);
6819 Dst =
MI.getOperand(0).getReg();
6820 B.setInsertPt(
B.getMBB(),
MI);
6824 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6825 const bool Unpacked = ST.hasUnpackedD16VMem();
6835 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6836 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6837 }
else if (IsFormat) {
6841 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6843 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6844 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6849 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6850 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6853 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6854 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6857 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6858 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6864 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6865 unsigned NumLoadDWords = NumValueDWords + 1;
6867 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6869 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6871 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6872 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6873 B.buildTrunc(Dst, ExtDst);
6874 }
else if (NumValueDWords == 1) {
6875 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6878 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6879 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6881 B.buildUnmerge(LoadElts, LoadDstReg);
6883 B.buildMergeLikeInstr(Dst, LoadElts);
6886 (IsD16 && !Ty.isVector())) {
6887 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6889 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6890 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6891 B.buildTrunc(Dst, LoadDstReg);
6892 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6894 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6896 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6897 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6899 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6901 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6902 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6903 B.buildMergeLikeInstr(Dst, Repack);
6906 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6909 MI.eraseFromParent();
6915 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6916 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6917 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6920 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6922 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6923 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6925 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6927 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6930 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6931 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6932 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6934 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6935 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6937 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6940 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6941 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6942 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6943 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6944 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6945 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6946 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6947 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6949 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6950 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6951 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6952 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6954 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6955 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6956 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6957 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6960 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6961 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6962 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6963 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6964 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6965 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6966 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6967 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6969 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6970 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6971 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6972 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6973 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6975 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6976 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6977 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6978 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6979 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6980 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6981 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6982 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6983 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6984 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6985 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6986 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6987 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6988 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6989 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6990 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6991 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6992 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6993 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6994 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6995 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6996 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6997 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6998 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6999 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
7000 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
7001 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
7002 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
7003 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
7004 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
7013 const bool IsCmpSwap =
7014 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
7015 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
7016 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
7017 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7028 CmpVal =
MI.getOperand(3).getReg();
7033 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
7034 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7037 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
7040 VIndex =
MI.getOperand(4 + OpOffset).getReg();
7043 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
7046 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
7047 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
7048 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
7067 .addImm(AuxiliaryData)
7068 .addImm(HasVIndex ? -1 : 0)
7069 .addMemOperand(MMO);
7071 MI.eraseFromParent();
7081 bool IsA16,
bool IsG16) {
7097 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7102 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7106 "Bias needs to be converted to 16 bit in A16 mode");
7108 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7114 if (((
I + 1) >= EndIdx) ||
7121 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7123 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7128 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7139 int DimIdx,
int NumVAddrs) {
7143 for (
int I = 0;
I != NumVAddrs; ++
I) {
7145 if (
SrcOp.isReg()) {
7151 int NumAddrRegs = AddrRegs.
size();
7152 if (NumAddrRegs != 1) {
7155 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7158 for (
int I = 1;
I != NumVAddrs; ++
I) {
7161 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7183 const unsigned NumDefs =
MI.getNumExplicitDefs();
7184 const unsigned ArgOffset = NumDefs + 1;
7185 bool IsTFE = NumDefs == 2;
7203 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7207 const bool IsAtomicPacked16Bit =
7208 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7209 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7217 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7218 const bool IsA16 = AddrTy ==
S16;
7219 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7222 if (!BaseOpcode->
Atomic) {
7223 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7226 }
else if (DMask != 0) {
7228 }
else if (!IsTFE && !BaseOpcode->
Store) {
7230 B.buildUndef(
MI.getOperand(0));
7231 MI.eraseFromParent();
7239 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7240 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7241 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7242 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7243 unsigned NewOpcode = LoadOpcode;
7244 if (BaseOpcode->
Store)
7245 NewOpcode = StoreOpcode;
7247 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7250 MI.setDesc(
B.getTII().get(NewOpcode));
7254 if (IsTFE && DMask == 0) {
7257 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7260 if (BaseOpcode->
Atomic) {
7265 if (Ty.isVector() && !IsAtomicPacked16Bit)
7272 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7273 MI.getOperand(2).setReg(
Concat.getReg(0));
7274 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7278 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7281 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7287 if (IsA16 && !ST.hasA16()) {
7292 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7293 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7295 if (IsA16 || IsG16) {
7303 const bool UseNSA = ST.hasNSAEncoding() &&
7304 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7305 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7306 const bool UsePartialNSA =
7307 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7309 if (UsePartialNSA) {
7313 auto Concat =
B.buildConcatVectors(
7314 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7315 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7316 PackedRegs.
resize(NSAMaxSize);
7317 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7319 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7320 PackedRegs[0] =
Concat.getReg(0);
7324 const unsigned NumPacked = PackedRegs.
size();
7327 if (!
SrcOp.isReg()) {
7337 SrcOp.setReg(AMDGPU::NoRegister);
7354 const bool UseNSA = ST.hasNSAEncoding() &&
7355 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7356 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7357 const bool UsePartialNSA =
7358 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7360 if (UsePartialNSA) {
7362 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7364 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7379 if (!Ty.isVector() || !IsD16)
7383 if (RepackedReg != VData) {
7384 MI.getOperand(1).setReg(RepackedReg);
7392 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7395 if (NumElts < DMaskLanes)
7398 if (NumElts > 4 || DMaskLanes > 4)
7408 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7409 const LLT AdjustedTy =
7425 if (IsD16 && ST.hasUnpackedD16VMem()) {
7432 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7433 unsigned RoundedSize = 32 * RoundedElts;
7437 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7442 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7448 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7452 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7453 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7457 MI.getOperand(0).setReg(NewResultReg);
7465 Dst1Reg =
MI.getOperand(1).getReg();
7470 MI.removeOperand(1);
7474 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7483 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7485 if (ResultNumRegs == 1) {
7487 ResultRegs[0] = NewResultReg;
7490 for (
int I = 0;
I != NumDataRegs; ++
I)
7492 B.buildUnmerge(ResultRegs, NewResultReg);
7497 ResultRegs.
resize(NumDataRegs);
7502 if (IsD16 && !Ty.isVector()) {
7503 B.buildTrunc(DstReg, ResultRegs[0]);
7508 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7509 B.buildBitcast(DstReg, ResultRegs[0]);
7521 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7523 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7524 }
else if (ST.hasUnpackedD16VMem()) {
7526 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7530 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7534 for (
int I = 0;
I != NumElts; ++
I)
7541 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7542 B.buildBuildVector(DstReg, ResultRegs);
7546 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7547 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7553 if (ResultRegs.
size() == 1) {
7554 NewResultReg = ResultRegs[0];
7555 }
else if (ResultRegs.
size() == 2) {
7557 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7565 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7567 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7572 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7573 B.buildConcatVectors(DstReg, ResultRegs);
7582 Register OrigDst =
MI.getOperand(0).getReg();
7584 LLT Ty =
B.getMRI()->getType(OrigDst);
7585 unsigned Size = Ty.getSizeInBits();
7588 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7590 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7591 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7594 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7596 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7605 B.setInsertPt(
B.getMBB(),
MI);
7610 B.setInsertPt(
B.getMBB(),
MI);
7616 MI.setDesc(
B.getTII().get(
Opc));
7617 MI.removeOperand(1);
7620 const unsigned MemSize = (
Size + 7) / 8;
7621 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7628 MI.addMemOperand(MF, MMO);
7629 if (Dst != OrigDst) {
7630 MI.getOperand(0).setReg(Dst);
7631 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7632 B.buildTrunc(OrigDst, Dst);
7654 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7655 MI.removeOperand(0);
7665 if (!ST.hasTrapHandler() ||
7669 return ST.supportsGetDoorbellID() ?
7682 MI.eraseFromParent();
7692 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7694 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7698 MI.eraseFromParent();
7707 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7714 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7734 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7737 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7738 B.buildCopy(SGPR01, Temp);
7739 B.buildInstr(AMDGPU::S_TRAP)
7742 MI.eraseFromParent();
7753 B.buildCopy(SGPR01, LiveIn);
7754 B.buildInstr(AMDGPU::S_TRAP)
7758 MI.eraseFromParent();
7767 if (ST.hasPrivEnabledTrap2NopBug()) {
7768 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7770 MI.eraseFromParent();
7774 B.buildInstr(AMDGPU::S_TRAP)
7776 MI.eraseFromParent();
7785 if (!ST.hasTrapHandler() ||
7789 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7792 B.buildInstr(AMDGPU::S_TRAP)
7796 MI.eraseFromParent();
7809 Register NodePtr =
MI.getOperand(2).getReg();
7810 Register RayExtent =
MI.getOperand(3).getReg();
7811 Register RayOrigin =
MI.getOperand(4).getReg();
7813 Register RayInvDir =
MI.getOperand(6).getReg();
7816 if (!ST.hasGFX10_AEncoding()) {
7819 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7828 const unsigned NumVDataDwords = 4;
7829 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7830 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7832 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7834 const unsigned BaseOpcodes[2][2] = {
7835 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7836 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7837 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7841 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7842 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7843 : AMDGPU::MIMGEncGfx10NSA,
7844 NumVDataDwords, NumVAddrDwords);
7848 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7849 : AMDGPU::MIMGEncGfx10Default,
7850 NumVDataDwords, NumVAddrDwords);
7855 if (UseNSA && IsGFX11Plus) {
7857 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7858 auto Merged =
B.buildMergeLikeInstr(
7859 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7860 Ops.push_back(Merged.getReg(0));
7863 Ops.push_back(NodePtr);
7864 Ops.push_back(RayExtent);
7865 packLanes(RayOrigin);
7868 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7869 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7870 auto MergedDir =
B.buildMergeLikeInstr(
7873 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7874 UnmergeRayDir.getReg(0)}))
7877 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7878 UnmergeRayDir.getReg(1)}))
7881 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7882 UnmergeRayDir.getReg(2)}))
7884 Ops.push_back(MergedDir.getReg(0));
7887 packLanes(RayInvDir);
7891 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7892 Ops.push_back(Unmerge.getReg(0));
7893 Ops.push_back(Unmerge.getReg(1));
7895 Ops.push_back(NodePtr);
7897 Ops.push_back(RayExtent);
7900 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7901 Ops.push_back(Unmerge.getReg(0));
7902 Ops.push_back(Unmerge.getReg(1));
7903 Ops.push_back(Unmerge.getReg(2));
7906 packLanes(RayOrigin);
7908 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7909 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7913 B.buildMergeLikeInstr(R1,
7914 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7915 B.buildMergeLikeInstr(
7916 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7917 B.buildMergeLikeInstr(
7918 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7924 packLanes(RayInvDir);
7931 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7933 Ops.push_back(MergedOps);
7936 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7945 .addImm(IsA16 ? 1 : 0)
7948 MI.eraseFromParent();
7958 Register DstOrigin =
MI.getOperand(1).getReg();
7960 Register NodePtr =
MI.getOperand(4).getReg();
7961 Register RayExtent =
MI.getOperand(5).getReg();
7962 Register InstanceMask =
MI.getOperand(6).getReg();
7963 Register RayOrigin =
MI.getOperand(7).getReg();
7965 Register Offsets =
MI.getOperand(9).getReg();
7966 Register TDescr =
MI.getOperand(10).getReg();
7968 if (!ST.hasBVHDualAndBVH8Insts()) {
7971 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7976 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7977 const unsigned NumVDataDwords = 10;
7978 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7980 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7981 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7982 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7985 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7986 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7988 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7989 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7995 .addUse(RayExtentInstanceMaskVec.getReg(0))
8002 MI.eraseFromParent();
8011 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
8012 MI.eraseFromParent();
8019 if (!ST.hasArchitectedSGPRs())
8023 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
8024 auto LSB =
B.buildConstant(
S32, 25);
8025 auto Width =
B.buildConstant(
S32, 5);
8026 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8027 MI.eraseFromParent();
8035 unsigned Width)
const {
8039 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8040 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8043 MI.eraseFromParent();
8061 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8065 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8068 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8069 MI.eraseFromParent();
8080 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8084 .addReg(Unmerge.getReg(0));
8088 .addReg(Unmerge.getReg(1));
8089 MI.eraseFromParent();
8101 case Intrinsic::amdgcn_icmp: {
8112 if (!Src1Const || Src1Const->Value != 0)
8116 int64_t Pred =
MI.getOperand(4).getImm();
8122 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8123 MI.eraseFromParent();
8126 case Intrinsic::sponentry:
8132 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8135 B.buildIntToPtr(DstReg, TmpReg);
8136 MI.eraseFromParent();
8138 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8140 B.buildFrameIndex(
MI.getOperand(0), FI);
8141 MI.eraseFromParent();
8144 case Intrinsic::amdgcn_if:
8145 case Intrinsic::amdgcn_else: {
8148 bool Negated =
false;
8160 std::swap(CondBrTarget, UncondBrTarget);
8162 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8163 if (IntrID == Intrinsic::amdgcn_if) {
8164 B.buildInstr(AMDGPU::SI_IF)
8167 .addMBB(UncondBrTarget);
8169 B.buildInstr(AMDGPU::SI_ELSE)
8172 .addMBB(UncondBrTarget);
8181 B.buildBr(*CondBrTarget);
8186 MI.eraseFromParent();
8187 BrCond->eraseFromParent();
8193 case Intrinsic::amdgcn_loop: {
8196 bool Negated =
false;
8206 std::swap(CondBrTarget, UncondBrTarget);
8208 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8209 B.buildInstr(AMDGPU::SI_LOOP)
8211 .addMBB(UncondBrTarget);
8216 B.buildBr(*CondBrTarget);
8218 MI.eraseFromParent();
8219 BrCond->eraseFromParent();
8226 case Intrinsic::amdgcn_wave_reduce_min:
8227 case Intrinsic::amdgcn_wave_reduce_umin:
8228 case Intrinsic::amdgcn_wave_reduce_max:
8229 case Intrinsic::amdgcn_wave_reduce_umax:
8230 case Intrinsic::amdgcn_wave_reduce_add:
8231 case Intrinsic::amdgcn_wave_reduce_sub:
8232 case Intrinsic::amdgcn_wave_reduce_and:
8233 case Intrinsic::amdgcn_wave_reduce_or:
8234 case Intrinsic::amdgcn_wave_reduce_xor: {
8239 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8240 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8241 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8242 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8243 auto Ext = NeedsSignExt ?
B.buildSExt(
LLT::scalar(32), SrcReg)
8248 .addUse(Ext.getReg(0))
8249 .addImm(
MI.getOperand(3).getImm());
8250 B.buildTrunc(DstReg, NewDst);
8251 MI.eraseFromParent();
8254 case Intrinsic::amdgcn_addrspacecast_nonnull:
8256 case Intrinsic::amdgcn_make_buffer_rsrc:
8258 case Intrinsic::amdgcn_kernarg_segment_ptr:
8261 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8262 MI.eraseFromParent();
8268 case Intrinsic::amdgcn_implicitarg_ptr:
8270 case Intrinsic::amdgcn_workitem_id_x:
8273 case Intrinsic::amdgcn_workitem_id_y:
8276 case Intrinsic::amdgcn_workitem_id_z:
8279 case Intrinsic::amdgcn_workgroup_id_x:
8284 case Intrinsic::amdgcn_workgroup_id_y:
8289 case Intrinsic::amdgcn_workgroup_id_z:
8294 case Intrinsic::amdgcn_cluster_id_x:
8295 return ST.hasClusters() &&
8298 case Intrinsic::amdgcn_cluster_id_y:
8299 return ST.hasClusters() &&
8302 case Intrinsic::amdgcn_cluster_id_z:
8303 return ST.hasClusters() &&
8306 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8307 return ST.hasClusters() &&
8310 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8311 return ST.hasClusters() &&
8314 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8315 return ST.hasClusters() &&
8318 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8319 return ST.hasClusters() &&
8321 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8322 return ST.hasClusters() &&
8325 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8326 return ST.hasClusters() &&
8329 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8330 return ST.hasClusters() &&
8333 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8334 return ST.hasClusters() &&
8338 case Intrinsic::amdgcn_wave_id:
8340 case Intrinsic::amdgcn_lds_kernel_id:
8343 case Intrinsic::amdgcn_dispatch_ptr:
8346 case Intrinsic::amdgcn_queue_ptr:
8349 case Intrinsic::amdgcn_implicit_buffer_ptr:
8352 case Intrinsic::amdgcn_dispatch_id:
8355 case Intrinsic::r600_read_ngroups_x:
8359 case Intrinsic::r600_read_ngroups_y:
8362 case Intrinsic::r600_read_ngroups_z:
8365 case Intrinsic::r600_read_local_size_x:
8368 case Intrinsic::r600_read_local_size_y:
8372 case Intrinsic::r600_read_local_size_z:
8375 case Intrinsic::amdgcn_fdiv_fast:
8377 case Intrinsic::amdgcn_is_shared:
8379 case Intrinsic::amdgcn_is_private:
8381 case Intrinsic::amdgcn_wavefrontsize: {
8382 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8383 MI.eraseFromParent();
8386 case Intrinsic::amdgcn_s_buffer_load:
8388 case Intrinsic::amdgcn_raw_buffer_store:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8390 case Intrinsic::amdgcn_struct_buffer_store:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8393 case Intrinsic::amdgcn_raw_buffer_store_format:
8394 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8395 case Intrinsic::amdgcn_struct_buffer_store_format:
8396 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8398 case Intrinsic::amdgcn_raw_tbuffer_store:
8399 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8400 case Intrinsic::amdgcn_struct_tbuffer_store:
8401 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8403 case Intrinsic::amdgcn_raw_buffer_load:
8404 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8405 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8406 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8407 case Intrinsic::amdgcn_struct_buffer_load:
8408 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8409 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8410 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8412 case Intrinsic::amdgcn_raw_buffer_load_format:
8413 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8414 case Intrinsic::amdgcn_struct_buffer_load_format:
8415 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8417 case Intrinsic::amdgcn_raw_tbuffer_load:
8418 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8419 case Intrinsic::amdgcn_struct_tbuffer_load:
8420 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8422 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8423 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8424 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8425 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8426 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8428 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8429 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8430 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8431 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8432 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8433 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8434 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8436 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8437 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8438 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8439 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8440 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8441 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8442 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8444 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8445 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8446 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8448 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8449 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8450 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8452 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8453 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8454 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8456 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8457 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8458 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8460 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8461 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8462 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8464 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8465 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8466 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8468 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8470 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8472 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8473 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8474 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8475 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8476 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8477 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8478 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8479 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8480 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8481 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8482 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8484 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8485 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8486 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8487 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8488 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8489 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8490 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8491 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8492 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8495 case Intrinsic::amdgcn_rsq_clamp:
8497 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8499 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8500 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8502 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8503 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8504 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8505 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8506 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8507 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8508 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8509 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8513 if (IndexArgTy !=
S64) {
8514 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8515 :
B.buildAnyExt(
S64, Index);
8516 MI.getOperand(5).setReg(NewIndex.getReg(0));
8520 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8521 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8522 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8531 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8534 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8535 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8536 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8537 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8538 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8539 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8540 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8542 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8544 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8548 if (IndexArgTy != IdxTy) {
8549 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8550 :
B.buildAnyExt(IdxTy, Index);
8551 MI.getOperand(7).setReg(NewIndex.getReg(0));
8556 case Intrinsic::amdgcn_fmed3: {
8562 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8563 MI.removeOperand(1);
8567 case Intrinsic::amdgcn_readlane:
8568 case Intrinsic::amdgcn_writelane:
8569 case Intrinsic::amdgcn_readfirstlane:
8570 case Intrinsic::amdgcn_permlane16:
8571 case Intrinsic::amdgcn_permlanex16:
8572 case Intrinsic::amdgcn_permlane64:
8573 case Intrinsic::amdgcn_set_inactive:
8574 case Intrinsic::amdgcn_set_inactive_chain_arg:
8575 case Intrinsic::amdgcn_mov_dpp8:
8576 case Intrinsic::amdgcn_update_dpp:
8577 case Intrinsic::amdgcn_permlane_bcast:
8578 case Intrinsic::amdgcn_permlane_up:
8579 case Intrinsic::amdgcn_permlane_down:
8580 case Intrinsic::amdgcn_permlane_xor:
8582 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8584 case Intrinsic::amdgcn_dead: {
8588 MI.eraseFromParent();
8591 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8592 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8593 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8594 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8595 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8596 MI.eraseFromParent();
8598 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8599 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8600 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8601 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8602 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8603 MI.eraseFromParent();
8605 case Intrinsic::amdgcn_av_load_b128:
8606 case Intrinsic::amdgcn_av_store_b128: {
8608 if (!ST.hasFlatGlobalInsts()) {
8609 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8610 ?
"llvm.amdgcn.av.load.b128"
8611 :
"llvm.amdgcn.av.store.b128";
8614 Fn,
Twine(Name) +
" not supported on subtarget",
MI.getDebugLoc()));
8617 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8618 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8619 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8621 B.buildStore(
MI.getOperand(2),
MI.getOperand(1),
8622 **
MI.memoperands_begin());
8623 MI.eraseFromParent();
8626 case Intrinsic::amdgcn_flat_load_monitor_b32:
8627 case Intrinsic::amdgcn_flat_load_monitor_b64:
8628 case Intrinsic::amdgcn_flat_load_monitor_b128:
8629 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8630 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8631 .add(
MI.getOperand(0))
8632 .add(
MI.getOperand(2))
8633 .addMemOperand(*
MI.memoperands_begin());
8634 MI.eraseFromParent();
8636 case Intrinsic::amdgcn_global_load_monitor_b32:
8637 case Intrinsic::amdgcn_global_load_monitor_b64:
8638 case Intrinsic::amdgcn_global_load_monitor_b128:
8639 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8640 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8641 .add(
MI.getOperand(0))
8642 .add(
MI.getOperand(2))
8643 .addMemOperand(*
MI.memoperands_begin());
8644 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.