37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
758 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
760 if (ST.hasScalarAddSub64()) {
763 .clampMaxNumElementsStrict(0,
S16, 2)
771 .clampMaxNumElementsStrict(0,
S16, 2)
778 if (ST.hasScalarSMulU64()) {
781 .clampMaxNumElementsStrict(0,
S16, 2)
789 .clampMaxNumElementsStrict(0,
S16, 2)
799 .minScalarOrElt(0,
S16)
804 }
else if (ST.has16BitInsts()) {
838 .widenScalarToNextMultipleOf(0, 32)
848 if (ST.hasMad64_32())
853 if (ST.hasIntClamp()) {
876 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
886 if (ST.hasVOP3PInsts()) {
888 .clampMaxNumElements(0,
S8, 2)
909 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
921 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
928 .clampScalar(0,
S16,
S64);
961 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
962 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
969 if (ST.has16BitInsts()) {
970 if (ST.hasVOP3PInsts())
973 FPOpActions.legalFor({
S16});
975 TrigActions.customFor({
S16});
976 FDIVActions.customFor({
S16});
979 if (ST.hasPackedFP32Ops()) {
980 FPOpActions.legalFor({
V2S32});
981 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
984 auto &MinNumMaxNumIeee =
987 if (ST.hasVOP3PInsts()) {
988 MinNumMaxNumIeee.legalFor(FPTypesPK16)
990 .clampMaxNumElements(0,
S16, 2)
993 }
else if (ST.has16BitInsts()) {
994 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
996 MinNumMaxNumIeee.legalFor(FPTypesBase)
1002 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1004 if (ST.hasVOP3PInsts()) {
1005 MinNumMaxNum.customFor(FPTypesPK16)
1007 .clampMaxNumElements(0,
S16, 2)
1008 .clampScalar(0,
S16,
S64)
1010 }
else if (ST.has16BitInsts()) {
1011 MinNumMaxNum.customFor(FPTypes16)
1012 .clampScalar(0,
S16,
S64)
1015 MinNumMaxNum.customFor(FPTypesBase)
1016 .clampScalar(0,
S32,
S64)
1020 if (ST.hasVOP3PInsts())
1037 .
legalFor(ST.hasPackedFP32Ops(), {V2S32})
1039 if (ST.hasPackedFP32Ops())
1043 if (ST.has16BitInsts()) {
1077 if (ST.hasFractBug()) {
1111 if (ST.hasCvtPkF16F32Inst()) {
1113 .clampMaxNumElements(0,
S16, 2);
1117 FPTruncActions.scalarize(0).lower();
1125 if (ST.has16BitInsts()) {
1139 if (ST.hasPackedFP32Ops())
1149 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1150 FMad.customFor({
S32,
S16});
1151 else if (ST.hasMadMacF32Insts())
1152 FMad.customFor({
S32});
1153 else if (ST.hasMadF16())
1154 FMad.customFor({
S16});
1159 if (ST.has16BitInsts()) {
1162 FRem.minScalar(0,
S32)
1171 .clampMaxNumElements(0,
S16, 2)
1190 if (ST.has16BitInsts())
1201 if (ST.has16BitInsts())
1214 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1218 if (
ST.has16BitInsts())
1228 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1229 .clampScalar(0,
S16,
S64)
1233 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1239 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1243 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1244 .clampScalar(0,
S16,
S64)
1248 if (
ST.has16BitInsts()) {
1249 getActionDefinitionsBuilder(
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1252 .clampScalar(0,
S16,
S64)
1255 getActionDefinitionsBuilder(
1256 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1258 .clampScalar(0,
S32,
S64)
1261 getActionDefinitionsBuilder(
1262 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1265 .clampScalar(0,
S32,
S64)
1269 getActionDefinitionsBuilder(G_PTR_ADD)
1275 getActionDefinitionsBuilder(G_PTRMASK)
1277 .scalarSameSizeAs(1, 0)
1281 getActionDefinitionsBuilder(G_ICMP)
1293 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1294 .legalForCartesianProduct(
1295 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1296 if (
ST.has16BitInsts()) {
1297 CmpBuilder.legalFor({{
S1,
S16}});
1308 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1310 if (
ST.hasSALUFloatInsts())
1319 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1320 if (
ST.has16BitInsts())
1321 ExpOps.customFor({{
S32}, {
S16}});
1323 ExpOps.customFor({
S32});
1324 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1327 getActionDefinitionsBuilder(G_FPOWI)
1328 .clampScalar(0, MinScalarFPTy,
S32)
1331 getActionDefinitionsBuilder(G_FLOG2)
1332 .legalFor(
ST.has16BitInsts(), {S16})
1337 getActionDefinitionsBuilder(G_FEXP2)
1338 .legalFor(
ST.has16BitInsts(), {S16})
1344 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1346 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1350 getActionDefinitionsBuilder(G_CTPOP)
1352 .clampScalar(0,
S32,
S32)
1353 .widenScalarToNextPow2(1, 32)
1354 .clampScalar(1,
S32,
S64)
1356 .widenScalarToNextPow2(0, 32);
1359 if (
ST.has16BitInsts())
1360 getActionDefinitionsBuilder(G_IS_FPCLASS)
1361 .legalForCartesianProduct({
S1}, FPTypes16)
1362 .widenScalarToNextPow2(1)
1366 getActionDefinitionsBuilder(G_IS_FPCLASS)
1367 .legalForCartesianProduct({
S1}, FPTypesBase)
1368 .lowerFor({
S1,
S16})
1369 .widenScalarToNextPow2(1)
1376 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1378 .clampScalar(0,
S32,
S32)
1379 .clampScalar(1,
S32,
S64)
1380 .widenScalarToNextPow2(0, 32)
1381 .widenScalarToNextPow2(1, 32)
1385 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1388 .clampScalar(0,
S32,
S32)
1389 .clampScalar(1,
S32,
S64)
1391 .widenScalarToNextPow2(0, 32)
1392 .widenScalarToNextPow2(1, 32);
1394 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1396 .clampScalar(0,
S32,
S32)
1397 .clampScalar(1,
S32,
S64)
1399 .widenScalarToNextPow2(0, 32)
1400 .widenScalarToNextPow2(1, 32);
1402 getActionDefinitionsBuilder(G_CTLS)
1405 .clampScalar(0,
S32,
S32)
1406 .clampScalar(1,
S32,
S32);
1410 getActionDefinitionsBuilder(G_BITREVERSE)
1412 .clampScalar(0,
S32,
S64)
1414 .widenScalarToNextPow2(0);
1416 if (
ST.has16BitInsts()) {
1417 getActionDefinitionsBuilder(G_BSWAP)
1419 .clampMaxNumElementsStrict(0,
S16, 2)
1422 .widenScalarToNextPow2(0)
1423 .clampScalar(0,
S16,
S32)
1426 if (
ST.hasVOP3PInsts()) {
1427 getActionDefinitionsBuilder(G_ABS)
1429 .clampMaxNumElements(0,
S16, 2)
1431 .widenScalarToNextPow2(0)
1434 if (
ST.hasIntMinMax64()) {
1435 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1437 .clampMaxNumElements(0,
S16, 2)
1439 .widenScalarToNextPow2(0)
1443 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1445 .clampMaxNumElements(0,
S16, 2)
1447 .widenScalarToNextPow2(0)
1452 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1454 .widenScalarToNextPow2(0)
1461 getActionDefinitionsBuilder(G_BSWAP)
1466 .widenScalarToNextPow2(0)
1471 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1474 .widenScalarToNextPow2(0)
1479 getActionDefinitionsBuilder(G_INTTOPTR)
1481 .legalForCartesianProduct(AddrSpaces64, {
S64})
1482 .legalForCartesianProduct(AddrSpaces32, {
S32})
1495 getActionDefinitionsBuilder(G_PTRTOINT)
1497 .legalForCartesianProduct(AddrSpaces64, {
S64})
1498 .legalForCartesianProduct(AddrSpaces32, {
S32})
1511 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1515 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1516 bool IsLoad) ->
bool {
1520 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1534 unsigned NumRegs = (MemSize + 31) / 32;
1536 if (!
ST.hasDwordx3LoadStores())
1547 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1548 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1549 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1555 for (
unsigned Op : {G_LOAD, G_STORE}) {
1556 const bool IsStore =
Op == G_STORE;
1558 auto &Actions = getActionDefinitionsBuilder(
Op);
1561 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1564 {
S64, GlobalPtr,
S64, GlobalAlign32},
1567 {
S32, GlobalPtr,
S8, GlobalAlign8},
1568 {
S32, GlobalPtr,
S16, GlobalAlign16},
1570 {
S32, LocalPtr,
S32, 32},
1571 {
S64, LocalPtr,
S64, 32},
1573 {
S32, LocalPtr,
S8, 8},
1574 {
S32, LocalPtr,
S16, 16},
1577 {
S32, PrivatePtr,
S32, 32},
1578 {
S32, PrivatePtr,
S8, 8},
1579 {
S32, PrivatePtr,
S16, 16},
1582 {
S32, ConstantPtr,
S32, GlobalAlign32},
1585 {
S64, ConstantPtr,
S64, GlobalAlign32},
1586 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1588 Actions.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1589 {{S16, GlobalPtr, S8, GlobalAlign8},
1590 {S16, GlobalPtr, S16, GlobalAlign16},
1591 {S16, LocalPtr, S8, 8},
1592 {S16, LocalPtr, S16, 16},
1593 {S16, PrivatePtr, S8, 8},
1594 {S16, PrivatePtr, S16, 16}});
1604 Actions.unsupportedIf(
1605 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1619 Actions.customIf(
typeIs(1, Constant32Ptr));
1645 return !Query.
Types[0].isVector() &&
1646 needToSplitMemOp(Query,
Op == G_LOAD);
1648 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1653 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1656 if (DstSize > MemSize)
1662 if (MemSize > MaxSize)
1670 return Query.
Types[0].isVector() &&
1671 needToSplitMemOp(Query,
Op == G_LOAD);
1673 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1687 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1688 if (MemSize > MaxSize) {
1692 if (MaxSize % EltSize == 0) {
1698 unsigned NumPieces = MemSize / MaxSize;
1702 if (NumPieces == 1 || NumPieces >= NumElts ||
1703 NumElts % NumPieces != 0)
1704 return std::pair(0, EltTy);
1712 return std::pair(0, EltTy);
1727 return std::pair(0, EltTy);
1732 .widenScalarToNextPow2(0)
1738 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1739 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1740 {
S32, GlobalPtr,
S16, 2 * 8},
1741 {
S32, LocalPtr,
S8, 8},
1742 {
S32, LocalPtr,
S16, 16},
1743 {
S32, PrivatePtr,
S8, 8},
1744 {
S32, PrivatePtr,
S16, 16},
1745 {
S32, ConstantPtr,
S8, 8},
1746 {
S32, ConstantPtr,
S16, 2 * 8}})
1752 if (
ST.hasFlatAddressSpace()) {
1753 ExtLoads.legalForTypesWithMemDesc(
1754 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1762 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1764 ExtLoads.narrowScalarIf(
1771 ExtLoads.clampScalar(0,
S32,
S32)
1772 .widenScalarToNextPow2(0)
1775 auto &Atomics = getActionDefinitionsBuilder(
1776 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1777 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1778 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1779 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1780 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1781 {
S64, GlobalPtr}, {
S64, LocalPtr},
1782 {
S32, RegionPtr}, {
S64, RegionPtr}});
1783 if (
ST.hasFlatAddressSpace()) {
1784 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1788 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1789 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1790 if (
ST.hasFlatAddressSpace()) {
1791 Atomics32.legalFor({{
S32, FlatPtr}});
1795 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1796 if (
ST.hasLDSFPAtomicAddF32()) {
1797 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1798 if (
ST.hasLdsAtomicAddF64())
1799 Atomic.legalFor({{
S64, LocalPtr}});
1800 if (
ST.hasAtomicDsPkAdd16Insts())
1801 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1803 if (
ST.hasAtomicFaddInsts())
1804 Atomic.legalFor({{
S32, GlobalPtr}});
1805 if (
ST.hasFlatAtomicFaddF32Inst())
1806 Atomic.legalFor({{
S32, FlatPtr}});
1808 if (
ST.hasGFX90AInsts() ||
ST.hasGFX1250Insts()) {
1819 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1820 ST.hasAtomicBufferGlobalPkAddF16Insts())
1821 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1822 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1823 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1824 if (
ST.hasAtomicFlatPkAdd16Insts())
1825 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1830 auto &AtomicFMinFMax =
1831 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1832 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1834 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1835 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1836 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1837 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1838 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1839 AtomicFMinFMax.legalFor({
F32, FlatPtr});
1840 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1841 AtomicFMinFMax.legalFor({
F64, FlatPtr});
1845 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1846 .customFor({{
S32, GlobalPtr}, {
S64, GlobalPtr},
1847 {
S32, FlatPtr}, {
S64, FlatPtr}})
1848 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1849 {
S32, RegionPtr}, {
S64, RegionPtr}});
1853 getActionDefinitionsBuilder(G_SELECT)
1855 LocalPtr, FlatPtr, PrivatePtr,
1859 .clampScalar(0,
S16,
S64)
1863 .clampMaxNumElements(0,
S32, 2)
1864 .clampMaxNumElements(0, LocalPtr, 2)
1865 .clampMaxNumElements(0, PrivatePtr, 2)
1867 .widenScalarToNextPow2(0)
1872 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1874 if (
ST.has16BitInsts()) {
1875 if (
ST.hasVOP3PInsts()) {
1877 .clampMaxNumElements(0,
S16, 2);
1879 Shifts.legalFor({{
S16,
S16}});
1882 Shifts.widenScalarIf(
1887 const LLT AmountTy = Query.
Types[1];
1892 Shifts.clampScalar(1,
S32,
S32);
1893 Shifts.widenScalarToNextPow2(0, 16);
1894 Shifts.clampScalar(0,
S16,
S64);
1896 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1904 Shifts.clampScalar(1,
S32,
S32);
1905 Shifts.widenScalarToNextPow2(0, 32);
1906 Shifts.clampScalar(0,
S32,
S64);
1908 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1913 Shifts.scalarize(0);
1915 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1916 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1917 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1918 unsigned IdxTypeIdx = 2;
1920 getActionDefinitionsBuilder(
Op)
1922 const LLT EltTy = Query.
Types[EltTypeIdx];
1923 const LLT VecTy = Query.
Types[VecTypeIdx];
1924 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1926 const bool isLegalVecType =
1936 return (EltSize == 32 || EltSize == 64) &&
1952 const LLT EltTy = Query.
Types[EltTypeIdx];
1953 const LLT VecTy = Query.
Types[VecTypeIdx];
1957 const unsigned TargetEltSize =
1958 DstEltSize % 64 == 0 ? 64 : 32;
1959 return std::pair(VecTypeIdx,
1963 .clampScalar(EltTypeIdx,
S32,
S64)
1964 .clampScalar(VecTypeIdx,
S32,
S64)
1965 .clampScalar(IdxTypeIdx,
S32,
S32)
1966 .clampMaxNumElements(VecTypeIdx,
S32, 32)
1975 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1977 const LLT &EltTy = Query.
Types[1].getElementType();
1978 return Query.
Types[0] != EltTy;
1981 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1982 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1983 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1984 getActionDefinitionsBuilder(
Op)
1987 const LLT BigTy = Query.
Types[BigTyIdx];
1993 const LLT LitTy = Query.
Types[LitTyIdx];
1998 .widenScalarToNextPow2(BigTyIdx, 32)
2006 const LLT BigTy = Query.
Types[BigTyIdx];
2007 const LLT LitTy = Query.
Types[LitTyIdx];
2015 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2024 if (
ST.hasScalarPackInsts()) {
2027 .minScalarOrElt(0,
S16)
2030 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2034 BuildVector.customFor({
V2S16,
S16});
2035 BuildVector.minScalarOrElt(0,
S32);
2037 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2045 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2047 .clampMaxNumElements(0,
S32, 32)
2048 .clampMaxNumElements(1,
S16, 2)
2049 .clampMaxNumElements(0,
S16, 64);
2051 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2054 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2055 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2056 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2058 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2059 const LLT Ty = Query.
Types[TypeIdx];
2071 getActionDefinitionsBuilder(
Op)
2075 const LLT BigTy = Query.
Types[BigTyIdx];
2081 .widenScalarToNextPow2(LitTyIdx, 16)
2090 .clampScalar(LitTyIdx,
S32,
S512)
2091 .widenScalarToNextPow2(LitTyIdx, 32)
2095 return notValidElt(Query, LitTyIdx);
2100 return notValidElt(Query, BigTyIdx);
2105 if (
Op == G_MERGE_VALUES) {
2106 Builder.widenScalarIf(
2109 const LLT Ty = Query.
Types[LitTyIdx];
2115 Builder.widenScalarIf(
2117 const LLT Ty = Query.
Types[BigTyIdx];
2123 const LLT &Ty = Query.
Types[BigTyIdx];
2125 if (NewSizeInBits >= 256) {
2127 if (RoundedTo < NewSizeInBits)
2128 NewSizeInBits = RoundedTo;
2130 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2139 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2140 .legalFor({{
S32}, {
S64}})
2141 .clampScalar(0,
S32,
S64);
2143 if (
ST.hasVOP3PInsts()) {
2144 SextInReg.lowerFor({{
V2S16}})
2148 .clampMaxNumElementsStrict(0,
S16, 2);
2149 }
else if (
ST.has16BitInsts()) {
2150 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2154 SextInReg.lowerFor({{
S32}, {
S64}});
2159 .clampScalar(0,
S32,
S64)
2162 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2166 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2167 FSHRActionDefs.legalFor({{
S32,
S32}})
2168 .clampMaxNumElementsStrict(0,
S16, 2);
2169 if (
ST.hasVOP3PInsts())
2171 FSHRActionDefs.scalarize(0).lower();
2173 if (
ST.hasVOP3PInsts()) {
2174 getActionDefinitionsBuilder(G_FSHL)
2176 .clampMaxNumElementsStrict(0,
S16, 2)
2180 getActionDefinitionsBuilder(G_FSHL)
2185 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2188 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({
S64});
2190 getActionDefinitionsBuilder(G_FENCE)
2193 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2198 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2200 .clampScalar(1,
S32,
S32)
2201 .clampScalar(0,
S32,
S64)
2202 .widenScalarToNextPow2(0)
2205 getActionDefinitionsBuilder(
2209 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2210 G_READ_REGISTER, G_WRITE_REGISTER,
2215 if (
ST.hasIEEEMinimumMaximumInsts()) {
2216 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2217 .legalFor(FPTypesPK16)
2218 .clampMaxNumElements(0,
S16, 2)
2220 }
else if (
ST.hasVOP3PInsts()) {
2221 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2223 .clampMaxNumElementsStrict(0,
S16, 2)
2227 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2229 .clampScalar(0,
S32,
S64)
2233 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2236 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2238 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2239 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2240 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2243 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2245 getActionDefinitionsBuilder(
2246 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2247 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2248 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2249 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2254 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2255 G_INTRINSIC_CONVERGENT,
2256 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2259 getLegacyLegalizerInfo().computeTables();
2269 switch (
MI.getOpcode()) {
2270 case TargetOpcode::G_ADDRSPACE_CAST:
2272 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2274 case TargetOpcode::G_FCEIL:
2276 case TargetOpcode::G_FREM:
2278 case TargetOpcode::G_INTRINSIC_TRUNC:
2280 case TargetOpcode::G_SITOFP:
2282 case TargetOpcode::G_UITOFP:
2284 case TargetOpcode::G_FPTOSI:
2286 case TargetOpcode::G_FPTOUI:
2288 case TargetOpcode::G_FMINNUM:
2289 case TargetOpcode::G_FMAXNUM:
2290 case TargetOpcode::G_FMINIMUMNUM:
2291 case TargetOpcode::G_FMAXIMUMNUM:
2293 case TargetOpcode::G_EXTRACT:
2295 case TargetOpcode::G_INSERT:
2297 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2299 case TargetOpcode::G_INSERT_VECTOR_ELT:
2301 case TargetOpcode::G_FSIN:
2302 case TargetOpcode::G_FCOS:
2304 case TargetOpcode::G_GLOBAL_VALUE:
2306 case TargetOpcode::G_LOAD:
2307 case TargetOpcode::G_SEXTLOAD:
2308 case TargetOpcode::G_ZEXTLOAD:
2310 case TargetOpcode::G_STORE:
2312 case TargetOpcode::G_FMAD:
2314 case TargetOpcode::G_FDIV:
2316 case TargetOpcode::G_FFREXP:
2318 case TargetOpcode::G_FSQRT:
2320 case TargetOpcode::G_UDIV:
2321 case TargetOpcode::G_UREM:
2322 case TargetOpcode::G_UDIVREM:
2324 case TargetOpcode::G_SDIV:
2325 case TargetOpcode::G_SREM:
2326 case TargetOpcode::G_SDIVREM:
2328 case TargetOpcode::G_ATOMIC_CMPXCHG:
2330 case TargetOpcode::G_FLOG2:
2332 case TargetOpcode::G_FLOG:
2333 case TargetOpcode::G_FLOG10:
2335 case TargetOpcode::G_FEXP2:
2337 case TargetOpcode::G_FEXP:
2338 case TargetOpcode::G_FEXP10:
2340 case TargetOpcode::G_FPOW:
2342 case TargetOpcode::G_FFLOOR:
2344 case TargetOpcode::G_BUILD_VECTOR:
2345 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2347 case TargetOpcode::G_MUL:
2349 case TargetOpcode::G_CTLZ:
2350 case TargetOpcode::G_CTTZ:
2352 case TargetOpcode::G_CTLS:
2354 case TargetOpcode::G_CTLZ_ZERO_POISON:
2356 case TargetOpcode::G_STACKSAVE:
2358 case TargetOpcode::G_GET_FPENV:
2360 case TargetOpcode::G_SET_FPENV:
2362 case TargetOpcode::G_TRAP:
2364 case TargetOpcode::G_DEBUGTRAP:
2384 if (ST.hasApertureRegs()) {
2389 ? AMDGPU::SRC_SHARED_BASE
2390 : AMDGPU::SRC_PRIVATE_BASE;
2391 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2392 !ST.hasGloballyAddressableScratch()) &&
2393 "Cannot use src_private_base with globally addressable scratch!");
2396 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2397 return B.buildUnmerge(
S32, Dst).getReg(1);
2412 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2428 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2431 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2453 B.buildObjectPtrOffset(
2455 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2456 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2464 switch (Def->getOpcode()) {
2465 case AMDGPU::G_FRAME_INDEX:
2466 case AMDGPU::G_GLOBAL_VALUE:
2467 case AMDGPU::G_BLOCK_ADDR:
2469 case AMDGPU::G_CONSTANT: {
2470 const ConstantInt *CI = Def->getOperand(1).getCImm();
2487 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2489 Intrinsic::amdgcn_addrspacecast_nonnull));
2494 :
MI.getOperand(1).getReg();
2498 unsigned SrcAS = SrcTy.getAddressSpace();
2508 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2515 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2517 ST.hasGloballyAddressableScratch()) {
2521 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2523 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2524 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2526 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2528 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2532 return B.buildExtract(Dst, Src, 0).getReg(0);
2538 castFlatToLocalOrPrivate(Dst);
2539 MI.eraseFromParent();
2545 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2546 auto FlatNull =
B.buildConstant(SrcTy, 0);
2549 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2553 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2555 MI.eraseFromParent();
2562 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2565 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2568 ST.hasGloballyAddressableScratch()) {
2573 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2577 if (ST.isWave64()) {
2578 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2584 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2585 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2587 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2591 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2592 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2594 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2595 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2604 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2610 castLocalOrPrivateToFlat(Dst);
2611 MI.eraseFromParent();
2615 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2622 SegmentNull.getReg(0));
2624 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2626 MI.eraseFromParent();
2631 SrcTy.getSizeInBits() == 64) {
2633 B.buildExtract(Dst, Src, 0);
2634 MI.eraseFromParent();
2641 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2642 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2643 if (AddrHiVal == 0) {
2645 B.buildIntToPtr(Dst, Zext);
2647 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2648 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2651 MI.eraseFromParent();
2658 MI.eraseFromParent();
2667 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2672 auto C1 =
B.buildFConstant(Ty, C1Val);
2673 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2676 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2677 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2679 auto C2 =
B.buildFConstant(Ty, C2Val);
2680 auto Fabs =
B.buildFAbs(Ty, Src);
2683 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2684 MI.eraseFromParent();
2702 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2704 const auto Zero =
B.buildFConstant(
S64, 0.0);
2705 const auto One =
B.buildFConstant(
S64, 1.0);
2708 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2709 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2712 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2713 MI.eraseFromParent();
2721 Register Src0Reg =
MI.getOperand(1).getReg();
2722 Register Src1Reg =
MI.getOperand(2).getReg();
2723 auto Flags =
MI.getFlags();
2726 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2727 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2728 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2729 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2730 MI.eraseFromParent();
2736 const unsigned FractBits = 52;
2737 const unsigned ExpBits = 11;
2740 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2741 auto Const1 =
B.buildConstant(
S32, ExpBits);
2743 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2745 .addUse(Const0.getReg(0))
2746 .addUse(Const1.getReg(0));
2748 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2762 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2769 const unsigned FractBits = 52;
2772 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2773 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2775 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2777 const auto Zero32 =
B.buildConstant(
S32, 0);
2780 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2782 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2783 auto Not =
B.buildNot(
S64, Shr);
2784 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2785 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2790 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2791 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2792 MI.eraseFromParent();
2808 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2809 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2812 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2813 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2815 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2816 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2819 B.buildFAdd(Dst, LdExp, CvtLo);
2820 MI.eraseFromParent();
2826 auto One =
B.buildConstant(
S32, 1);
2830 auto ThirtyOne =
B.buildConstant(
S32, 31);
2831 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2832 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2833 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2834 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2835 .addUse(Unmerge.getReg(1));
2836 auto LS2 =
B.buildSub(
S32, LS, One);
2837 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2839 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2840 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2841 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2842 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2843 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2844 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2845 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2846 B.buildFLdexp(Dst, FVal, Scale);
2847 MI.eraseFromParent();
2867 unsigned Flags =
MI.getFlags();
2878 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2886 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2887 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2891 K0 =
B.buildFConstant(
2893 K1 =
B.buildFConstant(
2896 K0 =
B.buildFConstant(
2898 K1 =
B.buildFConstant(
2902 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2903 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2904 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2907 :
B.buildFPTOUI(
S32, FloorMul);
2908 auto Lo =
B.buildFPTOUI(
S32, Fma);
2912 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2914 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2917 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2918 MI.eraseFromParent();
2950 unsigned StartIdx =
Offset / 32;
2952 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
2954 if (DstCount == 1) {
2956 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2961 for (
unsigned I = 0;
I < DstCount; ++
I)
2962 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
2963 B.buildMergeLikeInstr(DstReg, MergeVec);
2966 MI.eraseFromParent();
2976 Register InsertSrc =
MI.getOperand(2).getReg();
2985 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2989 unsigned DstCount = DstSize / 32;
2990 unsigned InsertCount = InsertSize / 32;
2991 unsigned StartIdx =
Offset / 32;
2993 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
2996 for (
unsigned I = 0;
I < StartIdx; ++
I)
2999 if (InsertCount == 1) {
3003 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
3006 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
3007 for (
unsigned I = 0;
I < InsertCount; ++
I)
3011 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
3014 B.buildMergeLikeInstr(DstReg, MergeVec);
3016 MI.eraseFromParent();
3043 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3044 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3045 B.buildIntToPtr(Dst, IntElt);
3047 MI.eraseFromParent();
3054 std::optional<ValueAndVReg> MaybeIdxVal =
3058 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3061 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3062 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3067 MI.eraseFromParent();
3096 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3097 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3098 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3100 B.buildIntToPtr(Dst, IntVecDest);
3101 MI.eraseFromParent();
3108 std::optional<ValueAndVReg> MaybeIdxVal =
3113 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3116 if (IdxVal < NumElts) {
3118 for (
unsigned i = 0; i < NumElts; ++i)
3120 B.buildUnmerge(SrcRegs, Vec);
3122 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3123 B.buildMergeLikeInstr(Dst, SrcRegs);
3128 MI.eraseFromParent();
3139 unsigned Flags =
MI.getFlags();
3143 if (ST.hasTrigReducedRange()) {
3144 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3145 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3146 .addUse(MulVal.getReg(0))
3150 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3153 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3157 MI.eraseFromParent();
3165 unsigned GAFlags)
const {
3194 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3196 if (ST.has64BitLiterals()) {
3200 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3204 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3213 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3214 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3217 B.buildExtract(DstReg, PCReg, 0);
3227 if (RequiresHighHalf && ST.has64BitLiterals()) {
3229 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3230 B.buildInstr(AMDGPU::S_MOV_B64)
3245 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3248 B.buildInstr(AMDGPU::S_MOV_B32)
3253 if (RequiresHighHalf) {
3255 "Must provide a 64-bit pointer type!");
3258 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3260 B.buildInstr(AMDGPU::S_MOV_B32)
3271 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3273 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3277 if (AddrDst != DstReg)
3278 B.buildCast(DstReg, AddrDst);
3279 }
else if (AddrLo != DstReg) {
3282 B.buildCast(DstReg, AddrLo);
3291 unsigned AS = Ty.getAddressSpace();
3299 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3303 Fn,
"local memory global used by non-kernel function",
3312 B.buildUndef(DstReg);
3313 MI.eraseFromParent();
3337 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3338 B.buildIntToPtr(DstReg, Sz);
3339 MI.eraseFromParent();
3345 MI.eraseFromParent();
3349 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3351 MI.eraseFromParent();
3359 MI.eraseFromParent();
3365 MI.eraseFromParent();
3381 if (Ty.getSizeInBits() == 32) {
3383 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3384 B.buildExtract(DstReg, Load, 0);
3386 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3388 MI.eraseFromParent();
3411 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3413 MI.getOperand(1).setReg(Cast.getReg(0));
3418 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3444 if (WideMemSize == ValSize) {
3450 MI.setMemRefs(MF, {WideMMO});
3456 if (ValSize > WideMemSize)
3463 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3464 B.buildTrunc(ValReg, WideLoad).getReg(0);
3471 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3472 B.buildExtract(ValReg, WideLoad, 0);
3476 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3477 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3481 MI.eraseFromParent();
3494 Register DataReg =
MI.getOperand(0).getReg();
3539 "this should not have been custom lowered");
3544 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3546 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3550 .setMemRefs(
MI.memoperands());
3552 MI.eraseFromParent();
3560 switch (
DefMI->getOpcode()) {
3561 case TargetOpcode::G_INTRINSIC: {
3563 case Intrinsic::amdgcn_frexp_mant:
3564 case Intrinsic::amdgcn_log:
3565 case Intrinsic::amdgcn_log_clamp:
3566 case Intrinsic::amdgcn_exp2:
3567 case Intrinsic::amdgcn_sqrt:
3575 case TargetOpcode::G_FSQRT:
3577 case TargetOpcode::G_FFREXP: {
3578 if (
DefMI->getOperand(0).getReg() == Src)
3582 case TargetOpcode::G_FPEXT: {
3603std::pair<Register, Register>
3605 unsigned Flags)
const {
3610 auto SmallestNormal =
B.buildFConstant(
3612 auto IsLtSmallestNormal =
3615 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3616 auto One =
B.buildFConstant(
F32, 1.0);
3618 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3619 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3621 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3634 LLT Ty =
B.getMRI()->getType(Dst);
3635 unsigned Flags =
MI.getFlags();
3640 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3641 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3642 .addUse(Ext.getReg(0))
3644 B.buildFPTrunc(Dst,
Log2, Flags);
3645 MI.eraseFromParent();
3653 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3656 MI.eraseFromParent();
3660 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3661 .addUse(ScaledInput)
3664 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3665 auto Zero =
B.buildFConstant(Ty, 0.0);
3667 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3668 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3670 MI.eraseFromParent();
3676 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3677 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3682 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3683 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3688 unsigned Flags =
MI.getFlags();
3701 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3703 B.buildFPTrunc(Dst, LogVal);
3708 MI.eraseFromParent();
3717 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3720 if (ST.hasFastFMAF32()) {
3722 const float c_log10 = 0x1.344134p-2f;
3723 const float cc_log10 = 0x1.09f79ep-26f;
3726 const float c_log = 0x1.62e42ep-1f;
3727 const float cc_log = 0x1.efa39ep-25f;
3729 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3730 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3734 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3735 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3736 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3737 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3738 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3741 const float ch_log10 = 0x1.344000p-2f;
3742 const float ct_log10 = 0x1.3509f6p-18f;
3745 const float ch_log = 0x1.62e000p-1f;
3746 const float ct_log = 0x1.0bfbe8p-15f;
3748 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3749 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3751 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3752 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3753 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3757 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3760 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3762 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3765 const bool IsFiniteOnly =
3768 if (!IsFiniteOnly) {
3771 auto Fabs =
B.buildFAbs(Ty,
Y);
3774 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3778 auto Zero =
B.buildFConstant(Ty, 0.0);
3780 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3781 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3782 B.buildFSub(Dst, R, Shift, Flags);
3784 B.buildCopy(Dst, R);
3787 MI.eraseFromParent();
3793 unsigned Flags)
const {
3794 const double Log2BaseInverted =
3797 LLT Ty =
B.getMRI()->getType(Dst);
3802 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3805 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3806 auto Zero =
B.buildFConstant(Ty, 0.0);
3808 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3809 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3811 if (ST.hasFastFMAF32())
3812 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3814 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3815 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3823 ?
B.buildFLog2(Ty, Src, Flags)
3824 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3827 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3828 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3839 unsigned Flags =
MI.getFlags();
3840 LLT Ty =
B.getMRI()->getType(Dst);
3850 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3851 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3852 .addUse(Ext.getReg(0))
3854 B.buildFPTrunc(Dst,
Log2, Flags);
3855 MI.eraseFromParent();
3865 MI.eraseFromParent();
3873 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3875 RangeCheckConst, Flags);
3877 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3878 auto Zero =
B.buildFConstant(Ty, 0.0);
3879 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3880 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3882 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3883 .addUse(AddInput.getReg(0))
3886 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3887 auto One =
B.buildFConstant(Ty, 1.0);
3888 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3889 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3890 MI.eraseFromParent();
3895 const SrcOp &Src,
unsigned Flags) {
3896 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3899 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3900 .addUse(Src.getReg())
3903 return B.buildFExp2(Dst, Src, Flags);
3909 bool IsExp10)
const {
3910 LLT Ty =
B.getMRI()->getType(
X);
3914 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3915 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3922 LLT Ty =
B.getMRI()->getType(Dst);
3929 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3932 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3933 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3934 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3937 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3939 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3940 .addUse(ExpInput.getReg(0))
3943 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3944 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3945 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3951 unsigned Flags)
const {
3952 LLT Ty =
B.getMRI()->getType(Dst);
3957 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3958 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3960 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
3961 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3962 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
3963 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3964 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3974 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
3978 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
3979 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3980 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
3982 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3983 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3985 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
3986 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3987 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
3988 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3990 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3991 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
3992 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3994 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4013 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
4015 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
4017 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
4019 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4020 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4021 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
4022 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
4024 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
4025 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
4026 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4027 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4029 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4030 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
4031 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4032 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4033 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4035 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4036 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4037 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4038 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4041 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4042 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4043 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4045 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4046 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4047 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4048 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4049 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4053 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4054 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4056 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4058 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4060 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4062 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4064 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4065 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4066 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4067 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4069 auto One =
B.buildFConstant(
S64, 1.0);
4070 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4071 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4074 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4075 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4082 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4089 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4091 MI.eraseFromParent();
4099 const unsigned Flags =
MI.getFlags();
4111 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4119 MI.eraseFromParent();
4130 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4133 B.buildFPTrunc(Dst, Lowered, Flags);
4134 MI.eraseFromParent();
4145 MI.eraseFromParent();
4173 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4176 if (ST.hasFastFMAF32()) {
4178 const float cc_exp = 0x1.4ae0bep-26f;
4179 const float c_exp10 = 0x1.a934f0p+1f;
4180 const float cc_exp10 = 0x1.2f346ep-24f;
4182 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4183 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4184 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4185 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4187 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4188 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4190 const float ch_exp = 0x1.714000p+0f;
4191 const float cl_exp = 0x1.47652ap-12f;
4193 const float ch_exp10 = 0x1.a92000p+1f;
4194 const float cl_exp10 = 0x1.4f0978p-11f;
4196 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4197 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4198 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4200 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4201 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4203 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4204 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4207 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4208 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4211 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4214 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4215 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4218 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4219 .addUse(
A.getReg(0))
4221 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4223 auto UnderflowCheckConst =
4224 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4225 auto Zero =
B.buildFConstant(Ty, 0.0);
4229 R =
B.buildSelect(Ty, Underflow, Zero, R);
4232 auto OverflowCheckConst =
4233 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4238 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4241 B.buildCopy(Dst, R);
4242 MI.eraseFromParent();
4251 unsigned Flags =
MI.getFlags();
4252 LLT Ty =
B.getMRI()->getType(Dst);
4257 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4258 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4259 .addUse(Log.getReg(0))
4262 B.buildFExp2(Dst,
Mul, Flags);
4263 }
else if (Ty == F16) {
4265 auto Log =
B.buildFLog2(F16, Src0, Flags);
4266 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4267 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4268 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4269 .addUse(Ext0.getReg(0))
4270 .addUse(Ext1.getReg(0))
4272 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4276 MI.eraseFromParent();
4284 ModSrc = SrcFNeg->getOperand(1).getReg();
4286 ModSrc = SrcFAbs->getOperand(1).getReg();
4288 ModSrc = SrcFAbs->getOperand(1).getReg();
4299 Register OrigSrc =
MI.getOperand(1).getReg();
4300 unsigned Flags =
MI.getFlags();
4302 "this should not have been custom lowered");
4312 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4332 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4334 B.buildFMinNum(Min, Fract, Const, Flags);
4339 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4342 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4343 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4345 MI.eraseFromParent();
4361 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4363 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4364 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4367 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4368 B.buildBitcast(Dst,
Merge);
4370 MI.eraseFromParent();
4387 bool UsePartialMad64_32,
4388 bool SeparateOddAlignedProducts)
const {
4403 auto getZero32 = [&]() ->
Register {
4405 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4408 auto getZero64 = [&]() ->
Register {
4410 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4415 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4426 if (CarryIn.empty())
4429 bool HaveCarryOut =
true;
4431 if (CarryIn.size() == 1) {
4433 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4437 CarryAccum = getZero32();
4439 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4440 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4442 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4447 LocalAccum = getZero32();
4448 HaveCarryOut =
false;
4453 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4454 LocalAccum =
Add.getReg(0);
4468 auto buildMadChain =
4471 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4472 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4479 if (LocalAccum.size() == 1 &&
4480 (!UsePartialMad64_32 || !CarryIn.empty())) {
4483 unsigned j1 = DstIndex - j0;
4484 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4488 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4490 LocalAccum[0] =
Mul.getReg(0);
4492 if (CarryIn.empty()) {
4493 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4496 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4502 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4506 if (j0 <= DstIndex) {
4507 bool HaveSmallAccum =
false;
4510 if (LocalAccum[0]) {
4511 if (LocalAccum.size() == 1) {
4512 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4513 HaveSmallAccum =
true;
4514 }
else if (LocalAccum[1]) {
4515 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4516 HaveSmallAccum =
false;
4518 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4519 HaveSmallAccum =
true;
4522 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4524 HaveSmallAccum =
true;
4528 unsigned j1 = DstIndex - j0;
4529 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4533 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4534 {Src0[j0], Src1[j1], Tmp});
4535 Tmp = Mad.getReg(0);
4536 if (!HaveSmallAccum)
4537 CarryOut.push_back(Mad.getReg(1));
4538 HaveSmallAccum =
false;
4541 }
while (j0 <= DstIndex);
4543 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4544 LocalAccum[0] = Unmerge.getReg(0);
4545 if (LocalAccum.size() > 1)
4546 LocalAccum[1] = Unmerge.getReg(1);
4573 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4574 Carry OddCarryIn = std::move(OddCarry);
4575 Carry EvenCarryIn = std::move(EvenCarry);
4580 if (2 * i < Accum.
size()) {
4581 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4582 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4587 if (!SeparateOddAlignedProducts) {
4588 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4589 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4591 bool IsHighest = 2 * i >= Accum.
size();
4594 .take_front(IsHighest ? 1 : 2);
4595 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4601 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4603 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4605 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4608 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4611 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4612 Lo->getOperand(1).getReg());
4613 Accum[2 * i] =
Hi.getReg(0);
4614 SeparateOddCarry =
Hi.getReg(1);
4621 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4622 EvenCarryIn.push_back(CarryOut);
4624 if (2 * i < Accum.
size()) {
4625 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4626 OddCarry.push_back(CarryOut);
4638 assert(ST.hasMad64_32());
4639 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4651 unsigned Size = Ty.getSizeInBits();
4652 if (ST.hasVMulU64Inst() &&
Size == 64)
4655 unsigned NumParts =
Size / 32;
4667 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4671 for (
unsigned i = 0; i < NumParts; ++i) {
4675 B.buildUnmerge(Src0Parts, Src0);
4676 B.buildUnmerge(Src1Parts, Src1);
4679 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4680 SeparateOddAlignedProducts);
4682 B.buildMergeLikeInstr(DstReg, AccumRegs);
4683 MI.eraseFromParent();
4698 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4699 ? AMDGPU::G_AMDGPU_FFBH_U32
4700 : AMDGPU::G_AMDGPU_FFBL_B32;
4701 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4704 MI.eraseFromParent();
4714 TypeSize NumBits = SrcTy.getSizeInBits();
4718 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4719 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4720 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4721 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4722 B.buildTrunc(Dst, Ctlz);
4723 MI.eraseFromParent();
4734 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4735 unsigned BitWidth = SrcTy.getSizeInBits();
4737 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4739 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4740 MI.eraseFromParent();
4746 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4749 return ConstVal == -1;
4756 Register CondDef =
MI.getOperand(0).getReg();
4775 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4784 UncondBrTarget = &*NextMBB;
4786 if (
Next->getOpcode() != AMDGPU::G_BR)
4805 *ArgRC,
B.getDebugLoc(), ArgTy);
4809 const unsigned Mask = Arg->
getMask();
4817 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4818 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4821 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4823 B.buildCopy(DstReg, LiveIn);
4833 if (!ST.hasClusters()) {
4836 MI.eraseFromParent();
4856 auto One =
B.buildConstant(
S32, 1);
4857 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4858 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4859 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4866 B.buildCopy(DstReg, GlobalIdXYZ);
4867 MI.eraseFromParent();
4871 B.buildCopy(DstReg, ClusterIdXYZ);
4872 MI.eraseFromParent();
4877 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4879 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4880 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4882 .addImm(ClusterIdField);
4883 auto Zero =
B.buildConstant(
S32, 0);
4886 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4887 MI.eraseFromParent();
4929 auto LoadConstant = [&](
unsigned N) {
4930 B.buildConstant(DstReg,
N);
4934 if (ST.hasArchitectedSGPRs() &&
4941 Arg = &WorkGroupIDX;
4942 ArgRC = &AMDGPU::SReg_32RegClass;
4946 Arg = &WorkGroupIDY;
4947 ArgRC = &AMDGPU::SReg_32RegClass;
4951 Arg = &WorkGroupIDZ;
4952 ArgRC = &AMDGPU::SReg_32RegClass;
4956 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
4957 return LoadConstant(0);
4958 Arg = &ClusterWorkGroupIDX;
4959 ArgRC = &AMDGPU::SReg_32RegClass;
4963 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
4964 return LoadConstant(0);
4965 Arg = &ClusterWorkGroupIDY;
4966 ArgRC = &AMDGPU::SReg_32RegClass;
4970 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
4971 return LoadConstant(0);
4972 Arg = &ClusterWorkGroupIDZ;
4973 ArgRC = &AMDGPU::SReg_32RegClass;
4978 return LoadConstant(ClusterDims.
getDims()[0] - 1);
4979 Arg = &ClusterWorkGroupMaxIDX;
4980 ArgRC = &AMDGPU::SReg_32RegClass;
4985 return LoadConstant(ClusterDims.
getDims()[1] - 1);
4986 Arg = &ClusterWorkGroupMaxIDY;
4987 ArgRC = &AMDGPU::SReg_32RegClass;
4992 return LoadConstant(ClusterDims.
getDims()[2] - 1);
4993 Arg = &ClusterWorkGroupMaxIDZ;
4994 ArgRC = &AMDGPU::SReg_32RegClass;
4998 Arg = &ClusterWorkGroupMaxFlatID;
4999 ArgRC = &AMDGPU::SReg_32RegClass;
5014 return LoadConstant(0);
5019 B.buildUndef(DstReg);
5023 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5035 MI.eraseFromParent();
5041 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5042 MI.eraseFromParent();
5049 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5063 B.buildUndef(DstReg);
5064 MI.eraseFromParent();
5068 if (Arg->isMasked()) {
5082 MI.eraseFromParent();
5097 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5106 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5114 Align Alignment)
const {
5118 "unexpected kernarg parameter type");
5125 MI.eraseFromParent();
5160 auto FloatY =
B.buildUITOFP(
S32,
Y);
5161 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5163 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5164 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5167 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5168 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5169 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5172 auto Q =
B.buildUMulH(
S32,
X, Z);
5173 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5176 auto One =
B.buildConstant(
S32, 1);
5179 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5185 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5188 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5207 auto Unmerge =
B.buildUnmerge(
S32, Val);
5209 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5210 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5212 auto Mad =
B.buildFMAD(
5216 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5217 auto Mul1 =
B.buildFMul(
5221 auto Mul2 =
B.buildFMul(
5223 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5226 auto Mad2 =
B.buildFMAD(
5230 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5231 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5233 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5248 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5250 auto Zero64 =
B.buildConstant(
S64, 0);
5251 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5253 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5254 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5256 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5257 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5258 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5260 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5261 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5262 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5264 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5265 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5266 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5267 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5268 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5270 auto Zero32 =
B.buildConstant(
S32, 0);
5271 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5272 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5273 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5275 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5276 Register NumerLo = UnmergeNumer.getReg(0);
5277 Register NumerHi = UnmergeNumer.getReg(1);
5279 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5280 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5281 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5282 Register Mul3_Lo = UnmergeMul3.getReg(0);
5283 Register Mul3_Hi = UnmergeMul3.getReg(1);
5284 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5285 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5286 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5287 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5289 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5290 Register DenomLo = UnmergeDenom.getReg(0);
5291 Register DenomHi = UnmergeDenom.getReg(1);
5294 auto C1 =
B.buildSExt(
S32, CmpHi);
5297 auto C2 =
B.buildSExt(
S32, CmpLo);
5300 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5307 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5308 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5309 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5310 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5312 auto One64 =
B.buildConstant(
S64, 1);
5313 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5319 auto C6 =
B.buildSelect(
5323 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5324 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5326 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5327 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5328 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5334 auto Sel1 =
B.buildSelect(
5341 auto Sel2 =
B.buildSelect(
5352 switch (
MI.getOpcode()) {
5355 case AMDGPU::G_UDIV: {
5356 DstDivReg =
MI.getOperand(0).getReg();
5359 case AMDGPU::G_UREM: {
5360 DstRemReg =
MI.getOperand(0).getReg();
5363 case AMDGPU::G_UDIVREM: {
5364 DstDivReg =
MI.getOperand(0).getReg();
5365 DstRemReg =
MI.getOperand(1).getReg();
5372 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5373 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5374 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5384 MI.eraseFromParent();
5395 if (Ty !=
S32 && Ty !=
S64)
5398 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5399 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5400 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5402 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5403 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5404 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5406 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5407 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5409 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5410 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5412 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5413 switch (
MI.getOpcode()) {
5416 case AMDGPU::G_SDIV: {
5417 DstDivReg =
MI.getOperand(0).getReg();
5421 case AMDGPU::G_SREM: {
5422 DstRemReg =
MI.getOperand(0).getReg();
5426 case AMDGPU::G_SDIVREM: {
5427 DstDivReg =
MI.getOperand(0).getReg();
5428 DstRemReg =
MI.getOperand(1).getReg();
5441 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5442 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5443 B.buildSub(DstDivReg, SignXor, Sign);
5447 auto Sign = LHSign.getReg(0);
5448 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5449 B.buildSub(DstRemReg, SignXor, Sign);
5452 MI.eraseFromParent();
5468 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5479 if (CLHS->isExactlyValue(1.0)) {
5480 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5484 MI.eraseFromParent();
5489 if (CLHS->isExactlyValue(-1.0)) {
5490 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5491 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5492 .addUse(FNeg.getReg(0))
5495 MI.eraseFromParent();
5502 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5507 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5510 B.buildFMul(Res, LHS, RCP, Flags);
5512 MI.eraseFromParent();
5527 if (!AllowInaccurateRcp)
5535 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5537 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5538 auto One =
B.buildFConstant(ResTy, 1.0);
5540 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5544 R =
B.buildFNeg(ResTy, R);
5546 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5547 R =
B.buildFMA(ResTy, Tmp0, R, R);
5549 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5550 R =
B.buildFMA(ResTy, Tmp1, R, R);
5554 B.buildCopy(Res, R);
5555 MI.eraseFromParent();
5559 auto Ret =
B.buildFMul(ResTy,
X, R);
5560 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5562 B.buildFMA(Res, Tmp2, R, Ret);
5563 MI.eraseFromParent();
5595 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5596 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5597 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5598 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5599 .addUse(RHSExt.getReg(0))
5601 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5603 if (ST.hasMadMacF32Insts()) {
5604 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5605 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5606 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5608 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5609 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5610 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5612 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5613 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5614 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5615 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5616 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5617 .addUse(RDst.getReg(0))
5622 MI.eraseFromParent();
5635 unsigned SPDenormMode =
5638 if (ST.hasDenormModeInst()) {
5640 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5642 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5643 B.buildInstr(AMDGPU::S_DENORM_MODE)
5644 .addImm(NewDenormModeValue);
5647 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5648 .addImm(SPDenormMode)
5670 auto One =
B.buildFConstant(
S32, 1.0f);
5672 auto DenominatorScaled =
5673 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5678 auto NumeratorScaled =
5679 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5685 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5686 .addUse(DenominatorScaled.getReg(0))
5688 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5691 const bool HasDynamicDenormals =
5696 if (!PreservesDenormals) {
5697 if (HasDynamicDenormals) {
5699 B.buildInstr(AMDGPU::S_GETREG_B32)
5700 .addDef(SavedSPDenormMode)
5706 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5707 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5708 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5709 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5710 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5711 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5713 if (!PreservesDenormals) {
5714 if (HasDynamicDenormals) {
5715 assert(SavedSPDenormMode);
5716 B.buildInstr(AMDGPU::S_SETREG_B32)
5717 .addReg(SavedSPDenormMode)
5723 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5724 .addUse(Fma4.getReg(0))
5725 .addUse(Fma1.getReg(0))
5726 .addUse(Fma3.getReg(0))
5727 .addUse(NumeratorScaled.getReg(1))
5730 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5731 .addUse(Fmas.getReg(0))
5736 MI.eraseFromParent();
5755 auto One =
B.buildFConstant(
S64, 1.0);
5757 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5763 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5765 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5766 .addUse(DivScale0.getReg(0))
5769 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5770 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5771 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5773 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5779 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5780 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5781 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5784 if (!ST.hasUsableDivScaleConditionOutput()) {
5790 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5791 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5792 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5793 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5796 Scale1Unmerge.getReg(1));
5798 Scale0Unmerge.getReg(1));
5799 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5801 Scale = DivScale1.getReg(1);
5804 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5805 .addUse(Fma4.getReg(0))
5806 .addUse(Fma3.getReg(0))
5807 .addUse(
Mul.getReg(0))
5811 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5812 .addUse(Fmas.getReg(0))
5817 MI.eraseFromParent();
5832 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5835 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5839 if (ST.hasFractBug()) {
5840 auto Fabs =
B.buildFAbs(Ty, Val);
5844 auto Zero =
B.buildConstant(InstrExpTy, 0);
5845 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5846 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5849 B.buildCopy(Res0, Mant);
5850 B.buildSExtOrTrunc(Res1, Exp);
5852 MI.eraseFromParent();
5867 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5870 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5871 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5872 auto C2 =
B.buildFConstant(
S32, 1.0f);
5875 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5877 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5879 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5880 .addUse(Mul0.getReg(0))
5883 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5885 B.buildFMul(Res, Sel, Mul1, Flags);
5887 MI.eraseFromParent();
5896 unsigned Flags =
MI.getFlags();
5897 assert(!ST.has16BitInsts());
5899 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5900 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5901 .addUse(Ext.getReg(0))
5903 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5904 MI.eraseFromParent();
5914 const unsigned Flags =
MI.getFlags();
5923 MI.eraseFromParent();
5927 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5929 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5930 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5931 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5936 .addUse(SqrtX.getReg(0))
5939 auto NegOne =
B.buildConstant(I32, -1);
5940 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5942 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5943 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5945 auto PosOne =
B.buildConstant(I32, 1);
5946 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5948 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5949 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5951 auto Zero =
B.buildFConstant(
F32, 0.0f);
5955 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5959 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5962 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
5963 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5965 auto Half =
B.buildFConstant(
F32, 0.5f);
5966 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
5967 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
5968 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
5969 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
5970 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5971 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
5972 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
5973 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5976 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
5978 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
5980 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5983 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5985 MI.eraseFromParent();
6020 unsigned Flags =
MI.getFlags();
6025 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
6027 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
6031 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6032 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6033 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6036 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6038 auto Half =
B.buildFConstant(
F64, 0.5);
6039 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6040 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6042 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6043 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6045 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6046 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6048 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6049 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6051 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6053 Register SqrtRet = SqrtS2.getReg(0);
6055 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6056 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6057 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6060 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6061 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6062 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6067 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6076 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6078 MI.eraseFromParent();
6109 auto Flags =
MI.getFlags();
6121 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6131 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6132 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6137 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6139 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6140 MI.eraseFromParent();
6152 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6153 IID == Intrinsic::amdgcn_permlanex16;
6154 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6155 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6156 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6157 IID == Intrinsic::amdgcn_permlane_up ||
6158 IID == Intrinsic::amdgcn_permlane_down ||
6159 IID == Intrinsic::amdgcn_permlane_xor;
6163 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6165 case Intrinsic::amdgcn_readfirstlane:
6166 case Intrinsic::amdgcn_permlane64:
6167 return LaneOp.getReg(0);
6168 case Intrinsic::amdgcn_readlane:
6169 case Intrinsic::amdgcn_set_inactive:
6170 case Intrinsic::amdgcn_set_inactive_chain_arg:
6171 return LaneOp.addUse(Src1).getReg(0);
6172 case Intrinsic::amdgcn_writelane:
6173 case Intrinsic::amdgcn_permlane_bcast:
6174 case Intrinsic::amdgcn_permlane_up:
6175 case Intrinsic::amdgcn_permlane_down:
6176 case Intrinsic::amdgcn_permlane_xor:
6177 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6178 case Intrinsic::amdgcn_permlane16:
6179 case Intrinsic::amdgcn_permlanex16: {
6181 int64_t Src4 =
MI.getOperand(6).getImm();
6182 int64_t Src5 =
MI.getOperand(7).getImm();
6183 return LaneOp.addUse(Src1)
6190 case Intrinsic::amdgcn_mov_dpp8:
6191 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6192 case Intrinsic::amdgcn_update_dpp:
6193 return LaneOp.addUse(Src1)
6194 .addImm(
MI.getOperand(4).getImm())
6195 .addImm(
MI.getOperand(5).getImm())
6196 .addImm(
MI.getOperand(6).getImm())
6197 .addImm(
MI.getOperand(7).getImm())
6207 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6208 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6209 IsPermlaneShuffle) {
6210 Src1 =
MI.getOperand(3).getReg();
6211 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6212 IsPermlaneShuffle) {
6213 Src2 =
MI.getOperand(4).getReg();
6218 unsigned Size = Ty.getSizeInBits();
6220 unsigned SplitSize = 32;
6221 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6222 ST.hasDPALU_DPP() &&
6226 if (
Size == SplitSize) {
6232 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6234 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6237 if (IID == Intrinsic::amdgcn_writelane)
6240 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6241 B.buildTrunc(DstReg, LaneOpDst);
6242 MI.eraseFromParent();
6246 if (
Size % SplitSize != 0)
6250 bool NeedsBitcast =
false;
6251 if (Ty.isVector()) {
6254 if (EltSize == SplitSize) {
6255 PartialResTy = EltTy;
6256 }
else if (EltSize == 16 || EltSize == 32) {
6257 unsigned NElem = SplitSize / EltSize;
6261 NeedsBitcast =
true;
6266 unsigned NumParts =
Size / SplitSize;
6270 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6271 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6273 if (IID == Intrinsic::amdgcn_writelane)
6274 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6276 for (
unsigned i = 0; i < NumParts; ++i) {
6277 Src0 = Src0Parts.
getReg(i);
6279 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6280 Src1 = Src1Parts.
getReg(i);
6282 if (IID == Intrinsic::amdgcn_writelane)
6283 Src2 = Src2Parts.
getReg(i);
6285 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6289 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6292 B.buildMergeLikeInstr(DstReg, PartialRes);
6294 MI.eraseFromParent();
6302 ST.getTargetLowering()->getImplicitParameterOffset(
6312 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6313 B.buildConstant(IdxTy,
Offset).getReg(0));
6324 Register Pointer =
MI.getOperand(2).getReg();
6326 Register NumRecords =
MI.getOperand(4).getReg();
6332 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6334 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6336 if (ST.has45BitNumRecordsBufferResource()) {
6341 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6342 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6343 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6344 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6348 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6349 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6350 auto ExtShiftedStride =
6351 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6352 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6353 auto ExtShiftedFlags =
6354 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6355 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6357 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6358 B.buildMergeValues(Result, {LowHalf, HighHalf});
6360 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6361 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6362 auto LowHalf = Unmerge.getReg(0);
6363 auto HighHalf = Unmerge.getReg(1);
6365 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6366 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6367 auto ShiftConst =
B.buildConstant(
S32, 16);
6368 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6369 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6370 Register NewHighHalfReg = NewHighHalf.getReg(0);
6371 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6374 MI.eraseFromParent();
6391 MI.eraseFromParent();
6399 std::optional<uint32_t> KnownSize =
6401 if (KnownSize.has_value())
6402 B.buildConstant(DstReg, *KnownSize);
6420 MI.eraseFromParent();
6427 unsigned AddrSpace)
const {
6429 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6433 ST.hasGloballyAddressableScratch()) {
6435 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6436 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6438 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6440 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6442 B.buildConstant(
S32, 1u << 26));
6447 MI.eraseFromParent();
6457std::pair<Register, unsigned>
6469 bool CheckNUW = ST.hasGFX1250Insts();
6471 MRI, OrigOffset,
nullptr, CheckNUW);
6475 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6485 unsigned Overflow = ImmOffset & ~MaxImm;
6486 ImmOffset -= Overflow;
6487 if ((int32_t)Overflow < 0) {
6488 Overflow += ImmOffset;
6492 if (Overflow != 0) {
6494 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6496 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6497 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6502 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6504 return std::pair(BaseReg, ImmOffset);
6511 bool ImageStore)
const {
6517 if (ST.hasUnpackedD16VMem()) {
6518 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6521 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6522 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6530 if (ImageStore && ST.hasImageStoreD16Bug()) {
6533 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6535 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6542 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6543 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6545 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6553 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6554 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6556 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6573 bool IsFormat)
const {
6585 VData =
B.buildBitcast(Ty, VData).getReg(0);
6593 if (Ty.isVector()) {
6594 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6606 bool IsFormat)
const {
6613 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6628 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6631 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6635 VIndex =
MI.getOperand(3).getReg();
6638 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6641 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6642 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6646 Format =
MI.getOperand(5 + OpOffset).getImm();
6650 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6656 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6657 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6658 }
else if (IsFormat) {
6659 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6660 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6664 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6667 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6670 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6675 auto MIB =
B.buildInstr(
Opc)
6686 MIB.addImm(AuxiliaryData)
6687 .addImm(HasVIndex ? -1 : 0)
6688 .addMemOperand(MMO);
6690 MI.eraseFromParent();
6696 unsigned ImmOffset,
unsigned Format,
6699 auto MIB =
B.buildInstr(
Opc)
6710 MIB.addImm(AuxiliaryData)
6711 .addImm(HasVIndex ? -1 : 0)
6712 .addMemOperand(MMO);
6718 bool IsTyped)
const {
6732 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6733 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6735 StatusDst =
MI.getOperand(1).getReg();
6740 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6743 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6746 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6749 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6752 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6755 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6756 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6760 Format =
MI.getOperand(5 + OpOffset).getImm();
6764 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6774 Dst =
MI.getOperand(0).getReg();
6775 B.setInsertPt(
B.getMBB(),
MI);
6782 Dst =
MI.getOperand(0).getReg();
6783 B.setInsertPt(
B.getMBB(),
MI);
6787 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6788 const bool Unpacked = ST.hasUnpackedD16VMem();
6798 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6799 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6800 }
else if (IsFormat) {
6804 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6806 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6807 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6812 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6813 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6816 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6817 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6820 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6821 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6827 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6828 unsigned NumLoadDWords = NumValueDWords + 1;
6830 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6832 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6834 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6835 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6836 B.buildTrunc(Dst, ExtDst);
6837 }
else if (NumValueDWords == 1) {
6838 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6841 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6842 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6844 B.buildUnmerge(LoadElts, LoadDstReg);
6846 B.buildMergeLikeInstr(Dst, LoadElts);
6849 (IsD16 && !Ty.isVector())) {
6850 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6852 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6853 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6854 B.buildTrunc(Dst, LoadDstReg);
6855 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6857 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6859 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6860 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6862 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6864 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6865 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6866 B.buildMergeLikeInstr(Dst, Repack);
6869 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6872 MI.eraseFromParent();
6878 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6879 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6880 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6883 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6885 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6888 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6889 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6890 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6893 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6895 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6898 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6900 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6903 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6904 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6905 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6908 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6910 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6911 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6913 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6915 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6916 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6918 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6919 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6920 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6921 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6923 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6925 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6928 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6929 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6930 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6931 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6933 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6934 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6935 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6937 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6938 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6939 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6940 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6941 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6942 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6943 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6944 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6945 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6947 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6948 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6949 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6950 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6951 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6952 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6953 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6954 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6955 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6957 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6958 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6959 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6960 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6961 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6962 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6963 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6964 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6965 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6966 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6967 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6976 const bool IsCmpSwap =
6977 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6978 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6979 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6980 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6991 CmpVal =
MI.getOperand(3).getReg();
6996 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6997 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7000 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
7003 VIndex =
MI.getOperand(4 + OpOffset).getReg();
7006 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
7009 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
7010 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
7011 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
7030 .addImm(AuxiliaryData)
7031 .addImm(HasVIndex ? -1 : 0)
7032 .addMemOperand(MMO);
7034 MI.eraseFromParent();
7044 bool IsA16,
bool IsG16) {
7060 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7065 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7069 "Bias needs to be converted to 16 bit in A16 mode");
7071 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7077 if (((
I + 1) >= EndIdx) ||
7084 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7086 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7091 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7102 int DimIdx,
int NumVAddrs) {
7106 for (
int I = 0;
I != NumVAddrs; ++
I) {
7108 if (
SrcOp.isReg()) {
7114 int NumAddrRegs = AddrRegs.
size();
7115 if (NumAddrRegs != 1) {
7118 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7121 for (
int I = 1;
I != NumVAddrs; ++
I) {
7124 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7146 const unsigned NumDefs =
MI.getNumExplicitDefs();
7147 const unsigned ArgOffset = NumDefs + 1;
7148 bool IsTFE = NumDefs == 2;
7166 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7170 const bool IsAtomicPacked16Bit =
7171 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7172 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7180 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7181 const bool IsA16 = AddrTy ==
S16;
7182 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7185 if (!BaseOpcode->
Atomic) {
7186 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7189 }
else if (DMask != 0) {
7191 }
else if (!IsTFE && !BaseOpcode->
Store) {
7193 B.buildUndef(
MI.getOperand(0));
7194 MI.eraseFromParent();
7202 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7203 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7204 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7205 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7206 unsigned NewOpcode = LoadOpcode;
7207 if (BaseOpcode->
Store)
7208 NewOpcode = StoreOpcode;
7210 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7213 MI.setDesc(
B.getTII().get(NewOpcode));
7217 if (IsTFE && DMask == 0) {
7220 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7223 if (BaseOpcode->
Atomic) {
7228 if (Ty.isVector() && !IsAtomicPacked16Bit)
7235 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7236 MI.getOperand(2).setReg(
Concat.getReg(0));
7237 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7241 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7244 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7250 if (IsA16 && !ST.hasA16()) {
7255 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7256 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7258 if (IsA16 || IsG16) {
7266 const bool UseNSA = ST.hasNSAEncoding() &&
7267 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7268 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7269 const bool UsePartialNSA =
7270 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7272 if (UsePartialNSA) {
7276 auto Concat =
B.buildConcatVectors(
7277 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7278 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7279 PackedRegs.
resize(NSAMaxSize);
7280 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7282 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7283 PackedRegs[0] =
Concat.getReg(0);
7287 const unsigned NumPacked = PackedRegs.
size();
7290 if (!
SrcOp.isReg()) {
7300 SrcOp.setReg(AMDGPU::NoRegister);
7317 const bool UseNSA = ST.hasNSAEncoding() &&
7318 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7319 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7320 const bool UsePartialNSA =
7321 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7323 if (UsePartialNSA) {
7325 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7327 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7342 if (!Ty.isVector() || !IsD16)
7346 if (RepackedReg != VData) {
7347 MI.getOperand(1).setReg(RepackedReg);
7355 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7358 if (NumElts < DMaskLanes)
7361 if (NumElts > 4 || DMaskLanes > 4)
7371 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7372 const LLT AdjustedTy =
7388 if (IsD16 && ST.hasUnpackedD16VMem()) {
7395 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7396 unsigned RoundedSize = 32 * RoundedElts;
7400 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7405 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7411 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7415 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7416 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7420 MI.getOperand(0).setReg(NewResultReg);
7428 Dst1Reg =
MI.getOperand(1).getReg();
7433 MI.removeOperand(1);
7437 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7446 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7448 if (ResultNumRegs == 1) {
7450 ResultRegs[0] = NewResultReg;
7453 for (
int I = 0;
I != NumDataRegs; ++
I)
7455 B.buildUnmerge(ResultRegs, NewResultReg);
7460 ResultRegs.
resize(NumDataRegs);
7465 if (IsD16 && !Ty.isVector()) {
7466 B.buildTrunc(DstReg, ResultRegs[0]);
7471 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7472 B.buildBitcast(DstReg, ResultRegs[0]);
7484 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7486 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7487 }
else if (ST.hasUnpackedD16VMem()) {
7489 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7493 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7497 for (
int I = 0;
I != NumElts; ++
I)
7504 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7505 B.buildBuildVector(DstReg, ResultRegs);
7509 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7510 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7516 if (ResultRegs.
size() == 1) {
7517 NewResultReg = ResultRegs[0];
7518 }
else if (ResultRegs.
size() == 2) {
7520 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7528 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7530 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7535 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7536 B.buildConcatVectors(DstReg, ResultRegs);
7545 Register OrigDst =
MI.getOperand(0).getReg();
7547 LLT Ty =
B.getMRI()->getType(OrigDst);
7548 unsigned Size = Ty.getSizeInBits();
7551 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7553 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7554 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7557 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7559 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7568 B.setInsertPt(
B.getMBB(),
MI);
7573 B.setInsertPt(
B.getMBB(),
MI);
7579 MI.setDesc(
B.getTII().get(
Opc));
7580 MI.removeOperand(1);
7583 const unsigned MemSize = (
Size + 7) / 8;
7584 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7591 MI.addMemOperand(MF, MMO);
7592 if (Dst != OrigDst) {
7593 MI.getOperand(0).setReg(Dst);
7594 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7595 B.buildTrunc(OrigDst, Dst);
7617 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7618 MI.removeOperand(0);
7628 if (!ST.hasTrapHandler() ||
7632 return ST.supportsGetDoorbellID() ?
7645 MI.eraseFromParent();
7655 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7657 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7661 MI.eraseFromParent();
7670 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7677 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7697 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7700 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7701 B.buildCopy(SGPR01, Temp);
7702 B.buildInstr(AMDGPU::S_TRAP)
7705 MI.eraseFromParent();
7716 B.buildCopy(SGPR01, LiveIn);
7717 B.buildInstr(AMDGPU::S_TRAP)
7721 MI.eraseFromParent();
7730 if (ST.hasPrivEnabledTrap2NopBug()) {
7731 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7733 MI.eraseFromParent();
7737 B.buildInstr(AMDGPU::S_TRAP)
7739 MI.eraseFromParent();
7748 if (!ST.hasTrapHandler() ||
7752 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7755 B.buildInstr(AMDGPU::S_TRAP)
7759 MI.eraseFromParent();
7772 Register NodePtr =
MI.getOperand(2).getReg();
7773 Register RayExtent =
MI.getOperand(3).getReg();
7774 Register RayOrigin =
MI.getOperand(4).getReg();
7776 Register RayInvDir =
MI.getOperand(6).getReg();
7779 if (!ST.hasGFX10_AEncoding()) {
7782 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7791 const unsigned NumVDataDwords = 4;
7792 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7793 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7795 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7797 const unsigned BaseOpcodes[2][2] = {
7798 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7799 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7800 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7804 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7805 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7806 : AMDGPU::MIMGEncGfx10NSA,
7807 NumVDataDwords, NumVAddrDwords);
7811 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7812 : AMDGPU::MIMGEncGfx10Default,
7813 NumVDataDwords, NumVAddrDwords);
7818 if (UseNSA && IsGFX11Plus) {
7820 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7821 auto Merged =
B.buildMergeLikeInstr(
7822 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7823 Ops.push_back(Merged.getReg(0));
7826 Ops.push_back(NodePtr);
7827 Ops.push_back(RayExtent);
7828 packLanes(RayOrigin);
7831 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7832 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7833 auto MergedDir =
B.buildMergeLikeInstr(
7836 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7837 UnmergeRayDir.getReg(0)}))
7840 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7841 UnmergeRayDir.getReg(1)}))
7844 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7845 UnmergeRayDir.getReg(2)}))
7847 Ops.push_back(MergedDir.getReg(0));
7850 packLanes(RayInvDir);
7854 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7855 Ops.push_back(Unmerge.getReg(0));
7856 Ops.push_back(Unmerge.getReg(1));
7858 Ops.push_back(NodePtr);
7860 Ops.push_back(RayExtent);
7863 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7864 Ops.push_back(Unmerge.getReg(0));
7865 Ops.push_back(Unmerge.getReg(1));
7866 Ops.push_back(Unmerge.getReg(2));
7869 packLanes(RayOrigin);
7871 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7872 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7876 B.buildMergeLikeInstr(R1,
7877 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7878 B.buildMergeLikeInstr(
7879 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7880 B.buildMergeLikeInstr(
7881 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7887 packLanes(RayInvDir);
7894 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7896 Ops.push_back(MergedOps);
7899 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7908 .addImm(IsA16 ? 1 : 0)
7911 MI.eraseFromParent();
7921 Register DstOrigin =
MI.getOperand(1).getReg();
7923 Register NodePtr =
MI.getOperand(4).getReg();
7924 Register RayExtent =
MI.getOperand(5).getReg();
7925 Register InstanceMask =
MI.getOperand(6).getReg();
7926 Register RayOrigin =
MI.getOperand(7).getReg();
7928 Register Offsets =
MI.getOperand(9).getReg();
7929 Register TDescr =
MI.getOperand(10).getReg();
7931 if (!ST.hasBVHDualAndBVH8Insts()) {
7934 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7939 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7940 const unsigned NumVDataDwords = 10;
7941 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7943 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7944 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7945 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7948 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7949 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7951 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7952 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7958 .addUse(RayExtentInstanceMaskVec.getReg(0))
7965 MI.eraseFromParent();
7974 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7975 MI.eraseFromParent();
7982 if (!ST.hasArchitectedSGPRs())
7986 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7987 auto LSB =
B.buildConstant(
S32, 25);
7988 auto Width =
B.buildConstant(
S32, 5);
7989 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7990 MI.eraseFromParent();
7998 unsigned Width)
const {
8002 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8003 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8006 MI.eraseFromParent();
8024 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8028 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8031 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8032 MI.eraseFromParent();
8043 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8047 .addReg(Unmerge.getReg(0));
8051 .addReg(Unmerge.getReg(1));
8052 MI.eraseFromParent();
8064 case Intrinsic::sponentry:
8070 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8073 B.buildIntToPtr(DstReg, TmpReg);
8074 MI.eraseFromParent();
8076 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8078 B.buildFrameIndex(
MI.getOperand(0), FI);
8079 MI.eraseFromParent();
8082 case Intrinsic::amdgcn_if:
8083 case Intrinsic::amdgcn_else: {
8086 bool Negated =
false;
8098 std::swap(CondBrTarget, UncondBrTarget);
8100 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8101 if (IntrID == Intrinsic::amdgcn_if) {
8102 B.buildInstr(AMDGPU::SI_IF)
8105 .addMBB(UncondBrTarget);
8107 B.buildInstr(AMDGPU::SI_ELSE)
8110 .addMBB(UncondBrTarget);
8119 B.buildBr(*CondBrTarget);
8124 MI.eraseFromParent();
8125 BrCond->eraseFromParent();
8131 case Intrinsic::amdgcn_loop: {
8134 bool Negated =
false;
8144 std::swap(CondBrTarget, UncondBrTarget);
8146 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8147 B.buildInstr(AMDGPU::SI_LOOP)
8149 .addMBB(UncondBrTarget);
8154 B.buildBr(*CondBrTarget);
8156 MI.eraseFromParent();
8157 BrCond->eraseFromParent();
8164 case Intrinsic::amdgcn_addrspacecast_nonnull:
8166 case Intrinsic::amdgcn_make_buffer_rsrc:
8168 case Intrinsic::amdgcn_kernarg_segment_ptr:
8171 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8172 MI.eraseFromParent();
8178 case Intrinsic::amdgcn_implicitarg_ptr:
8180 case Intrinsic::amdgcn_workitem_id_x:
8183 case Intrinsic::amdgcn_workitem_id_y:
8186 case Intrinsic::amdgcn_workitem_id_z:
8189 case Intrinsic::amdgcn_workgroup_id_x:
8194 case Intrinsic::amdgcn_workgroup_id_y:
8199 case Intrinsic::amdgcn_workgroup_id_z:
8204 case Intrinsic::amdgcn_cluster_id_x:
8205 return ST.hasClusters() &&
8208 case Intrinsic::amdgcn_cluster_id_y:
8209 return ST.hasClusters() &&
8212 case Intrinsic::amdgcn_cluster_id_z:
8213 return ST.hasClusters() &&
8216 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8217 return ST.hasClusters() &&
8220 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8221 return ST.hasClusters() &&
8224 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8225 return ST.hasClusters() &&
8228 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8229 return ST.hasClusters() &&
8231 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8232 return ST.hasClusters() &&
8235 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8236 return ST.hasClusters() &&
8239 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8240 return ST.hasClusters() &&
8243 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8244 return ST.hasClusters() &&
8248 case Intrinsic::amdgcn_wave_id:
8250 case Intrinsic::amdgcn_lds_kernel_id:
8253 case Intrinsic::amdgcn_dispatch_ptr:
8256 case Intrinsic::amdgcn_queue_ptr:
8259 case Intrinsic::amdgcn_implicit_buffer_ptr:
8262 case Intrinsic::amdgcn_dispatch_id:
8265 case Intrinsic::r600_read_ngroups_x:
8269 case Intrinsic::r600_read_ngroups_y:
8272 case Intrinsic::r600_read_ngroups_z:
8275 case Intrinsic::r600_read_local_size_x:
8278 case Intrinsic::r600_read_local_size_y:
8282 case Intrinsic::r600_read_local_size_z:
8285 case Intrinsic::amdgcn_fdiv_fast:
8287 case Intrinsic::amdgcn_is_shared:
8289 case Intrinsic::amdgcn_is_private:
8291 case Intrinsic::amdgcn_wavefrontsize: {
8292 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8293 MI.eraseFromParent();
8296 case Intrinsic::amdgcn_s_buffer_load:
8298 case Intrinsic::amdgcn_raw_buffer_store:
8299 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8300 case Intrinsic::amdgcn_struct_buffer_store:
8301 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8303 case Intrinsic::amdgcn_raw_buffer_store_format:
8304 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8305 case Intrinsic::amdgcn_struct_buffer_store_format:
8306 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8308 case Intrinsic::amdgcn_raw_tbuffer_store:
8309 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8310 case Intrinsic::amdgcn_struct_tbuffer_store:
8311 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8313 case Intrinsic::amdgcn_raw_buffer_load:
8314 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8315 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8316 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8317 case Intrinsic::amdgcn_struct_buffer_load:
8318 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8319 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8320 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8322 case Intrinsic::amdgcn_raw_buffer_load_format:
8323 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8324 case Intrinsic::amdgcn_struct_buffer_load_format:
8325 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8327 case Intrinsic::amdgcn_raw_tbuffer_load:
8328 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8329 case Intrinsic::amdgcn_struct_tbuffer_load:
8330 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8332 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8334 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8336 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8337 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8338 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8344 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8346 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8348 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8349 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8350 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8352 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8353 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8354 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8355 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8356 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8357 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8358 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8359 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8360 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8362 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8364 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8365 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8366 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8367 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8368 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8369 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8370 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8371 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8372 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8373 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8374 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8375 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8376 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8377 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8378 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8379 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8380 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8381 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8382 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8384 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8385 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8386 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8388 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8390 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8392 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8393 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8394 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8395 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8396 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8398 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8400 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8402 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8405 case Intrinsic::amdgcn_rsq_clamp:
8407 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8409 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8410 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8412 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8413 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8414 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8415 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8416 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8417 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8418 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8419 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8423 if (IndexArgTy !=
S64) {
8424 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8425 :
B.buildAnyExt(
S64, Index);
8426 MI.getOperand(5).setReg(NewIndex.getReg(0));
8430 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8431 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8432 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8433 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8434 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8435 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8436 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8437 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8441 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8444 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8445 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8446 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8447 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8448 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8449 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8450 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8451 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8452 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8454 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8458 if (IndexArgTy != IdxTy) {
8459 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8460 :
B.buildAnyExt(IdxTy, Index);
8461 MI.getOperand(7).setReg(NewIndex.getReg(0));
8466 case Intrinsic::amdgcn_fmed3: {
8472 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8473 MI.removeOperand(1);
8477 case Intrinsic::amdgcn_readlane:
8478 case Intrinsic::amdgcn_writelane:
8479 case Intrinsic::amdgcn_readfirstlane:
8480 case Intrinsic::amdgcn_permlane16:
8481 case Intrinsic::amdgcn_permlanex16:
8482 case Intrinsic::amdgcn_permlane64:
8483 case Intrinsic::amdgcn_set_inactive:
8484 case Intrinsic::amdgcn_set_inactive_chain_arg:
8485 case Intrinsic::amdgcn_mov_dpp8:
8486 case Intrinsic::amdgcn_update_dpp:
8487 case Intrinsic::amdgcn_permlane_bcast:
8488 case Intrinsic::amdgcn_permlane_up:
8489 case Intrinsic::amdgcn_permlane_down:
8490 case Intrinsic::amdgcn_permlane_xor:
8492 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8494 case Intrinsic::amdgcn_dead: {
8498 MI.eraseFromParent();
8501 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8502 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8503 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8504 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8505 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8506 MI.eraseFromParent();
8508 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8509 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8510 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8511 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8512 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8513 MI.eraseFromParent();
8515 case Intrinsic::amdgcn_av_load_b128:
8516 case Intrinsic::amdgcn_av_store_b128: {
8518 if (!ST.hasFlatGlobalInsts()) {
8519 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8520 ?
"llvm.amdgcn.av.load.b128"
8521 :
"llvm.amdgcn.av.store.b128";
8524 Fn,
Twine(Name) +
" not supported on subtarget",
MI.getDebugLoc()));
8527 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8528 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8529 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8531 B.buildStore(
MI.getOperand(2),
MI.getOperand(1),
8532 **
MI.memoperands_begin());
8533 MI.eraseFromParent();
8536 case Intrinsic::amdgcn_flat_load_monitor_b32:
8537 case Intrinsic::amdgcn_flat_load_monitor_b64:
8538 case Intrinsic::amdgcn_flat_load_monitor_b128:
8539 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8540 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8541 .add(
MI.getOperand(0))
8542 .add(
MI.getOperand(2))
8543 .addMemOperand(*
MI.memoperands_begin());
8544 MI.eraseFromParent();
8546 case Intrinsic::amdgcn_global_load_monitor_b32:
8547 case Intrinsic::amdgcn_global_load_monitor_b64:
8548 case Intrinsic::amdgcn_global_load_monitor_b128:
8549 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8550 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8551 .add(
MI.getOperand(0))
8552 .add(
MI.getOperand(2))
8553 .addMemOperand(*
MI.memoperands_begin());
8554 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.