#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false), cl::ReallyHidden);
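// Editorial note: the fragments that follow are the file's local legality
// predicates and mutations; each one inspects Query.Types[TypeIdx] to decide,
// for example, when an odd-sized small vector must grow by one element or be
// split into 64-bit pieces before the AMDGPU rules below accept it.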
    const LLT Ty = Query.Types[TypeIdx];
           EltSize > 1 && EltSize < 32 &&

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;

    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

    const LLT Ty = Query.Types[TypeIdx];
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    return EltSize == 16 || EltSize % 32 == 0;

    return EltSize == 32 || EltSize == 64 ||
           EltSize == 128 || EltSize == 256;
  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

                                    bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
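// Editorial note: the returns above appear to be per-address-space maximum
// memory-access widths in bits (e.g. 128-bit DS accesses when useDS128() is
// set, wider limits for global loads than for stores); the load/store rules
// below use this limit to decide when an access must be split.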
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

                   AtomicOrdering::NotAtomic))

    if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                                 Align(AlignBits / 8)))

  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)
                                 uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)

                             Query.Types[1].getAddressSpace(), Opcode);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
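// Editorial note: the two helpers above appear to round-trip 128-bit buffer
// resource ("fat") pointers either through a <4 x s32> element-wise
// build/extract sequence or through a single 128-bit scalar bitcast,
// depending on which representation the surrounding code needs.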
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .minScalarOrElt(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32);
      .minScalarOrElt(0, S32)
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
      .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

      .legalFor(FPTypesPK16)
      .clampScalar(0, S16, S64);
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64);
      .clampScalar(0, S32, S64);
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .clampScalar(1, S32, S32)

  FPTruncActions.legalFor(
  FPTruncActions.scalarize(0).lower();

    FMad.customFor({S32, S16});
    FMad.customFor({S32});
    FMad.customFor({S16});

  FRem.minScalar(0, S32)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);
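// Editorial note: each getActionDefinitionsBuilder() chain above pairs a
// legality core (legalFor/customFor) with normalizing mutations such as
// clampScalar and widenScalarToNextPow2, so e.g. an s8 operand is widened to
// s32 before the legal 32/64-bit forms apply.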
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);
  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

      getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
          {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});

    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});

    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);

  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)

    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)

      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
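// Editorial note: each row of legalForTypesWithMemDesc() reads as
// {ResultTy, PointerTy, MemoryTy, MinAlignInBits}; e.g. {S32, GlobalPtr, S8,
// GlobalAlign8} makes an extending 8-bit global load into a 32-bit register
// legal at the stated minimum alignment.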
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
          return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
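// Editorial note: floating-point atomic-add legality is entirely
// feature-gated above: LDS f32/f64 and packed-16-bit forms, then global/flat
// f32 and packed forms, are only marked legal when the corresponding
// subtarget predicate (e.g. hasAtomicFaddInsts()) holds; everything else
// falls through to later custom/lowering rules.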
  auto &AtomicFMinFMax =
      getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
    AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
    AtomicFMinFMax.legalFor({F32, FlatPtr});
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
    AtomicFMinFMax.legalFor({F64, FlatPtr});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);

      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;

        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];

        .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
              const LLT BigTy = Query.Types[BigTyIdx];
          .widenScalarToNextPow2(LitTyIdx, 16)
          .clampScalar(LitTyIdx, S32, S512)
          .widenScalarToNextPow2(LitTyIdx, 32)
            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});

    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)

    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)

    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
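// Editorial note: computeTables() freezes the rule set built above. The
// switch that follows appears to be AMDGPULegalizerInfo::legalizeCustom(),
// which dispatches every opcode marked .custom()/.customFor() above to a
// dedicated legalize* helper.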
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

  unsigned NullVal = TM.getNullPointerValue(DestAS);

  auto SegmentNull = B.buildConstant(DstTy, NullVal);
  auto FlatNull = B.buildConstant(SrcTy, 0);

  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
  MI.eraseFromParent();
  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();

  Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                            SegmentNull.getReg(0));
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
  MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);
  MI.eraseFromParent();
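// Editorial note: for segment<->flat casts that are not known non-null, the
// lowering above compares the source against its address space's null value
// and selects between the rebuilt pointer and the destination null, so null
// round-trips to null across the cast.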
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();

  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
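// Editorial note: 64-bit integer-to-FP above splits the value into two 32-bit
// halves; for an f64 result it converts hi and lo separately and recombines
// them as ldexp(hi, 32) + lo, while for an f32 result it normalizes the value
// by its leading-bit position first, converts 32 bits, and rescales with
// ldexp.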
  const LLT SrcLT = MRI.getType(Src);
  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));

    K0 = B.buildFConstant(S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
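// Editorial note: the K0/K1 constants above are 2^-32 and -2^32 in the source
// format; the sequence computes Hi = fptoui(floor(x * 2^-32)) and
// Lo = fptoui(fma(floor(x * 2^-32), -2^32, x)), then reassembles the 64-bit
// result (and re-applies the sign in the signed case).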
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);
    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
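// Editorial note: G_FSIN/G_FCOS are lowered above by pre-multiplying the
// input by 1/(2*pi) (optionally taking the fractional part via amdgcn.fract
// when the subtarget needs it) and then emitting the hardware amdgcn.sin /
// amdgcn.cos intrinsic on the scaled value.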
2864 unsigned GAFlags)
const {
2865 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
2893 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2904 if (!
B.getMRI()->getRegClassOrNull(PCReg))
2905 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2908 B.buildExtract(DstReg, PCReg, 0);
2922 Register AddrLo = !RequiresHighHalf && !
MRI.getRegClassOrNull(DstReg)
2924 :
MRI.createGenericVirtualRegister(
S32);
2926 if (!
MRI.getRegClassOrNull(AddrLo))
2927 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2930 B.buildInstr(AMDGPU::S_MOV_B32)
2935 if (RequiresHighHalf) {
2937 "Must provide a 64-bit pointer type!");
2940 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2942 B.buildInstr(AMDGPU::S_MOV_B32)
2952 if (!
MRI.getRegClassOrNull(AddrDst))
2953 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2955 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2959 if (AddrDst != DstReg)
2960 B.buildCast(DstReg, AddrDst);
2961 }
else if (AddrLo != DstReg) {
2964 B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);

        GV->getName() != "llvm.amdgcn.module.lds" &&

          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

                                    *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();
  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
3244 case TargetOpcode::G_INTRINSIC: {
3246 case Intrinsic::amdgcn_frexp_mant:
3254 case TargetOpcode::G_FFREXP: {
3259 case TargetOpcode::G_FPEXT: {
3283std::pair<Register, Register>
3285 unsigned Flags)
const {
3290 auto SmallestNormal =
B.buildFConstant(
3292 auto IsLtSmallestNormal =
3295 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3296 auto One =
B.buildFConstant(
F32, 1.0);
3298 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3299 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3301 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3314 LLT Ty =
B.getMRI()->getType(Dst);
3315 unsigned Flags =
MI.getFlags();
3320 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3321 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3322 .addUse(Ext.getReg(0))
3324 B.buildFPTrunc(Dst,
Log2, Flags);
3325 MI.eraseFromParent();
3333 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3336 MI.eraseFromParent();
3340 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3341 .addUse(ScaledInput)
3344 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3345 auto Zero =
B.buildFConstant(Ty, 0.0);
3347 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3348 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3350 MI.eraseFromParent();
3356 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3357 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3362 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3363 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3368 unsigned Flags =
MI.getFlags();
3369 const LLT Ty =
MRI.getType(
X);
3379 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3382 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3384 B.buildFPTrunc(Dst, LogVal);
3389 MI.eraseFromParent();
3398 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3403 const float c_log10 = 0x1.344134p-2f;
3404 const float cc_log10 = 0x1.09f79ep-26f;
3407 const float c_log = 0x1.62e42ep-1f;
3408 const float cc_log = 0x1.efa39ep-25f;
3410 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3411 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3413 R =
B.buildFMul(Ty,
Y,
C, Flags).getReg(0);
3414 auto NegR =
B.buildFNeg(Ty, R, Flags);
3415 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, Flags);
3416 auto FMA1 =
B.buildFMA(Ty,
Y,
CC, FMA0, Flags);
3417 R =
B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3420 const float ch_log10 = 0x1.344000p-2f;
3421 const float ct_log10 = 0x1.3509f6p-18f;
3424 const float ch_log = 0x1.62e000p-1f;
3425 const float ct_log = 0x1.0bfbe8p-15f;
3427 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3428 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
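// Editorial note: the log lowering above computes log2(x) with amdgcn.log and
// multiplies by ln(2) or log10(2) split into a high/low constant pair (c/cc
// when the target has fast f32 FMA, ch/ct otherwise) to keep extra precision;
// denormal inputs are pre-scaled by 2^32 and the result is corrected by the
// matching offset afterwards.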
3470 unsigned Flags)
const {
3471 const double Log2BaseInverted =
3474 LLT Ty =
B.getMRI()->getType(Dst);
3479 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3482 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3483 auto Zero =
B.buildFConstant(Ty, 0.0);
3485 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3486 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3489 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3491 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3492 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3500 ?
B.buildFLog2(Ty, Src, Flags)
3501 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3504 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3505 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3516 unsigned Flags =
MI.getFlags();
3517 LLT Ty =
B.getMRI()->getType(Dst);
3523 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3524 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3525 .addUse(Ext.getReg(0))
3527 B.buildFPTrunc(Dst,
Log2, Flags);
3528 MI.eraseFromParent();
3538 MI.eraseFromParent();
3546 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3548 RangeCheckConst, Flags);
3550 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3551 auto Zero =
B.buildFConstant(Ty, 0.0);
3552 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3553 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3555 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3556 .addUse(AddInput.getReg(0))
3559 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3560 auto One =
B.buildFConstant(Ty, 1.0);
3561 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3562 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3563 MI.eraseFromParent();
3569 LLT Ty =
B.getMRI()->getType(Dst);
3574 auto Mul =
B.buildFMul(Ty,
X, Log2E, Flags);
3578 .addUse(
Mul.getReg(0))
3581 B.buildFExp2(Dst,
Mul.getReg(0), Flags);
3587 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3590 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3591 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3592 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3595 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3597 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3598 .addUse(ExpInput.getReg(0))
3601 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3602 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3603 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3611 const unsigned Flags =
MI.getFlags();
3614 LLT Ty =
MRI.getType(Dst);
3617 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
3624 MI.eraseFromParent();
3632 auto Ext =
B.buildFPExt(
F32,
X, Flags);
3635 B.buildFPTrunc(Dst, Lowered, Flags);
3636 MI.eraseFromParent();
3646 MI.eraseFromParent();
3674 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
3679 const float cc_exp = 0x1.4ae0bep-26f;
3680 const float c_exp10 = 0x1.a934f0p+1f;
3681 const float cc_exp10 = 0x1.2f346ep-24f;
3683 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3684 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
3685 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
3686 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
3688 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3689 PL =
B.buildFMA(Ty,
X,
CC, FMA0, Flags).getReg(0);
3691 const float ch_exp = 0x1.714000p+0f;
3692 const float cl_exp = 0x1.47652ap-12f;
3694 const float ch_exp10 = 0x1.a92000p+1f;
3695 const float cl_exp10 = 0x1.4f0978p-11f;
3697 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3698 auto XH =
B.buildAnd(Ty,
X, MaskConst);
3699 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
3701 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3702 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
3704 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3705 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
3708 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
3709 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3712 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
3715 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
3716 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
3719 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3720 .addUse(
A.getReg(0))
3722 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
3724 auto UnderflowCheckConst =
3725 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3726 auto Zero =
B.buildFConstant(Ty, 0.0);
3730 R =
B.buildSelect(Ty, Underflow, Zero, R);
3735 auto OverflowCheckConst =
3736 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3741 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
3744 B.buildCopy(Dst, R);
3745 MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
3787 ModSrc = SrcFNeg->getOperand(1).getReg();
3789 ModSrc = SrcFAbs->getOperand(1).getReg();
3791 ModSrc = SrcFAbs->getOperand(1).getReg();
3802 Register OrigSrc =
MI.getOperand(1).getReg();
3803 unsigned Flags =
MI.getFlags();
3805 "this should not have been custom lowered");
3815 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
3827 B.buildFConstant(
F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3835 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3837 B.buildFMinNum(Min, Fract, Const, Flags);
3842 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
3845 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
3846 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3848 MI.eraseFromParent();
3864 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3866 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
3867 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
3870 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
3871 B.buildBitcast(Dst,
Merge);
3873 MI.eraseFromParent();
3890 bool UsePartialMad64_32,
3891 bool SeparateOddAlignedProducts)
const {
3906 auto getZero32 = [&]() ->
Register {
3908 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
3911 auto getZero64 = [&]() ->
Register {
3913 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
3918 for (
unsigned i = 0; i < Src0.
size(); ++i) {
3929 if (CarryIn.empty())
3932 bool HaveCarryOut =
true;
3934 if (CarryIn.size() == 1) {
3936 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
3940 CarryAccum = getZero32();
3942 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
3943 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3945 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
3950 LocalAccum = getZero32();
3951 HaveCarryOut =
false;
3956 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
3957 LocalAccum =
Add.getReg(0);
3971 auto buildMadChain =
3974 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
3975 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
3982 if (LocalAccum.size() == 1 &&
3983 (!UsePartialMad64_32 || !CarryIn.empty())) {
3986 unsigned j1 = DstIndex - j0;
3987 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3991 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
3993 LocalAccum[0] =
Mul.getReg(0);
3995 if (CarryIn.empty()) {
3996 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
3999 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4005 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4009 if (j0 <= DstIndex) {
4010 bool HaveSmallAccum =
false;
4013 if (LocalAccum[0]) {
4014 if (LocalAccum.size() == 1) {
4015 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4016 HaveSmallAccum =
true;
4017 }
else if (LocalAccum[1]) {
4018 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4019 HaveSmallAccum =
false;
4021 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4022 HaveSmallAccum =
true;
4025 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4027 HaveSmallAccum =
true;
4031 unsigned j1 = DstIndex - j0;
4032 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4036 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4037 {Src0[j0], Src1[j1], Tmp});
4038 Tmp = Mad.getReg(0);
4039 if (!HaveSmallAccum)
4040 CarryOut.push_back(Mad.getReg(1));
4041 HaveSmallAccum =
false;
4044 }
while (j0 <= DstIndex);
4046 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4047 LocalAccum[0] = Unmerge.getReg(0);
4048 if (LocalAccum.size() > 1)
4049 LocalAccum[1] = Unmerge.getReg(1);
4076 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4077 Carry OddCarryIn = std::move(OddCarry);
4078 Carry EvenCarryIn = std::move(EvenCarry);
4083 if (2 * i < Accum.
size()) {
4084 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4085 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4090 if (!SeparateOddAlignedProducts) {
4091 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4092 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4094 bool IsHighest = 2 * i >= Accum.
size();
4098 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4104 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4106 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4108 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4111 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4114 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4115 Lo->getOperand(1).getReg());
4116 Accum[2 * i] =
Hi.getReg(0);
4117 SeparateOddCarry =
Hi.getReg(1);
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
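// Editorial note: wide G_MUL is decomposed above into 32-bit limbs
// (NumParts = Size / 32); buildMultiply accumulates the partial products with
// G_AMDGPU_MAD_U64_U32 chains, optionally keeping odd-aligned products in a
// separate chain to shorten carry dependencies, and finally remerges the
// limbs into the destination.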
4195 LLT DstTy =
MRI.getType(Dst);
4196 LLT SrcTy =
MRI.getType(Src);
4198 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4199 ? AMDGPU::G_AMDGPU_FFBH_U32
4200 : AMDGPU::G_AMDGPU_FFBL_B32;
4201 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4204 MI.eraseFromParent();
4213 LLT SrcTy =
MRI.getType(Src);
4218 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4219 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4220 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4221 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4222 B.buildTrunc(Dst, Ctlz);
4223 MI.eraseFromParent();
4229 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4232 return ConstVal && *ConstVal == -1;
4239 Register CondDef =
MI.getOperand(0).getReg();
4240 if (!
MRI.hasOneNonDBGUse(CondDef))
4248 if (!
MRI.hasOneNonDBGUse(NegatedCond))
4254 UseMI = &*
MRI.use_instr_nodbg_begin(NegatedCond);
4263 if (Next == Parent->
end()) {
4267 UncondBrTarget = &*NextMBB;
4269 if (Next->getOpcode() != AMDGPU::G_BR)
4287 *ArgRC,
B.getDebugLoc(), ArgTy);
4291 const unsigned Mask = Arg->
getMask();
4292 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4299 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4300 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4303 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4305 B.buildCopy(DstReg, LiveIn);
4334 Arg = &WorkGroupIDX;
4335 ArgRC = &AMDGPU::SReg_32RegClass;
4339 Arg = &WorkGroupIDY;
4340 ArgRC = &AMDGPU::SReg_32RegClass;
4344 Arg = &WorkGroupIDZ;
4345 ArgRC = &AMDGPU::SReg_32RegClass;
4360 B.buildConstant(DstReg, 0);
4366 B.buildUndef(DstReg);
4370 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4381 MI.eraseFromParent();
4387 B.buildConstant(
MI.getOperand(0).getReg(),
C);
4388 MI.eraseFromParent();
4409 B.buildUndef(DstReg);
4410 MI.eraseFromParent();
4414 if (Arg->isMasked()) {
4428 MI.eraseFromParent();
4435 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
4445 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4453 Align Alignment)
const {
4457 "unexpected kernarg parameter type");
4461 B.buildLoad(DstReg,
Ptr, PtrInfo,
Align(4),
4464 MI.eraseFromParent();
4472 LLT DstTy =
MRI.getType(Dst);
4499 auto FloatY =
B.buildUITOFP(
S32,
Y);
4500 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
4501 auto Scale =
B.buildFConstant(
S32, llvm::bit_cast<float>(0x4f7ffffe));
4502 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
4503 auto Z =
B.buildFPTOUI(
S32, ScaledY);
4506 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
4507 auto NegYZ =
B.buildMul(
S32, NegY, Z);
4508 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
4511 auto Q =
B.buildUMulH(
S32,
X, Z);
4512 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
4515 auto One =
B.buildConstant(
S32, 1);
4518 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
4524 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
4527 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
4546 auto Unmerge =
B.buildUnmerge(
S32, Val);
4548 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
4549 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
4551 auto Mad =
B.buildFMAD(
4553 B.buildFConstant(
S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4555 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
4556 auto Mul1 =
B.buildFMul(
4557 S32, Rcp,
B.buildFConstant(
S32, llvm::bit_cast<float>(0x5f7ffffc)));
4560 auto Mul2 =
B.buildFMul(
4561 S32, Mul1,
B.buildFConstant(
S32, llvm::bit_cast<float>(0x2f800000)));
4562 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
4565 auto Mad2 =
B.buildFMAD(
4566 S32, Trunc,
B.buildFConstant(
S32, llvm::bit_cast<float>(0xcf800000)),
4569 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
4570 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
4572 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4587 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
4589 auto Zero64 =
B.buildConstant(
S64, 0);
4590 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
4592 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
4593 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
4595 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
4596 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4597 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4599 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
4600 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4601 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
4603 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
4604 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
4605 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
4606 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4607 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4609 auto Zero32 =
B.buildConstant(
S32, 0);
4610 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
4611 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4612 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
4614 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
4615 Register NumerLo = UnmergeNumer.getReg(0);
4616 Register NumerHi = UnmergeNumer.getReg(1);
4618 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
4619 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
4620 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
4621 Register Mul3_Lo = UnmergeMul3.getReg(0);
4622 Register Mul3_Hi = UnmergeMul3.getReg(1);
4623 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
4624 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4625 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
4626 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
4628 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
4629 Register DenomLo = UnmergeDenom.getReg(0);
4630 Register DenomHi = UnmergeDenom.getReg(1);
4633 auto C1 =
B.buildSExt(
S32, CmpHi);
4636 auto C2 =
B.buildSExt(
S32, CmpLo);
4639 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
4646 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
4647 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4648 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4649 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
4651 auto One64 =
B.buildConstant(
S64, 1);
4652 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
4658 auto C6 =
B.buildSelect(
4662 auto Add4 =
B.buildAdd(
S64, Add3, One64);
4663 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
4665 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4666 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4667 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
4673 auto Sel1 =
B.buildSelect(
4680 auto Sel2 =
B.buildSelect(
4691 switch (
MI.getOpcode()) {
4694 case AMDGPU::G_UDIV: {
4695 DstDivReg =
MI.getOperand(0).getReg();
4698 case AMDGPU::G_UREM: {
4699 DstRemReg =
MI.getOperand(0).getReg();
4702 case AMDGPU::G_UDIVREM: {
4703 DstDivReg =
MI.getOperand(0).getReg();
4704 DstRemReg =
MI.getOperand(1).getReg();
4711 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
4712 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
4713 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
4714 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
4723 MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4751 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4752 switch (
MI.getOpcode()) {
4755 case AMDGPU::G_SDIV: {
4756 DstDivReg =
MI.getOperand(0).getReg();
4757 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
4760 case AMDGPU::G_SREM: {
4761 DstRemReg =
MI.getOperand(0).getReg();
4762 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
4765 case AMDGPU::G_SDIVREM: {
4766 DstDivReg =
MI.getOperand(0).getReg();
4767 DstRemReg =
MI.getOperand(1).getReg();
4768 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
4769 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
4780 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
4781 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4782 B.buildSub(DstDivReg, SignXor, Sign);
4786 auto Sign = LHSign.getReg(0);
4787 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4788 B.buildSub(DstRemReg, SignXor, Sign);
4791 MI.eraseFromParent();
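// Illustrative sketch (not from the original source): the scalar identity
// behind the shift/add/xor sequence above. s = x >> 63 is all-ones for a
// negative x, so (x + s) ^ s yields |x| (ignoring the INT64_MIN edge case),
// and the result signs are restored with the same xor-then-subtract pattern.
// The plain unsigned divide stands in for the unsigned expansion emitted
// above.
#include <cstdint>

static void sdivrem64_sketch(int64_t L, int64_t R, int64_t &Quot,
                             int64_t &Rem) {
  int64_t LS = L >> 63;                    // LHSign
  int64_t RS = R >> 63;                    // RHSign
  uint64_t UL = (uint64_t)((L + LS) ^ LS); // |L|
  uint64_t UR = (uint64_t)((R + RS) ^ RS); // |R|
  uint64_t UQ = UL / UR;                   // unsigned quotient
  uint64_t URem = UL % UR;                 // unsigned remainder
  int64_t QS = LS ^ RS;                    // quotient sign
  Quot = ((int64_t)UQ ^ QS) - QS;          // negate if signs differed
  Rem = ((int64_t)URem ^ LS) - LS;         // remainder takes the sign of L
}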
  LLT ResTy = MRI.getType(Res);
  /*...*/
  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
    return false;

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        /*...*/;
    MI.eraseFromParent();
    /*...*/
  }

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))
        /*...*/;
    MI.eraseFromParent();
    /*...*/
  }

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              /*...*/))
    /*...*/;

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
                 /*...*/;
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);
  /*...*/
  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
               /*...*/;

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
                 /*...*/;
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
  /*...*/
  Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
  Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  /*...*/
  Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
  Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  /*...*/
  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
      /*...*/;
  MI.eraseFromParent();
  unsigned SPDenormMode =
      /*...*/;

  if (ST.hasDenormModeInst()) {
    /*...*/
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        /*...*/;
  }
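// Illustrative sketch (not from the original source): how the packed value
// written by S_DENORM_MODE above is formed. The single-precision denorm
// field occupies the low two bits and the double/half field the next two,
// which is why the DP default is shifted left by 2 before being OR'd in.
#include <cstdint>

static uint32_t packDenormModeImm(uint32_t SPMode, uint32_t DPDefault) {
  return (SPMode & 0x3) | ((DPDefault & 0x3) << 2);
}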
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          /*...*/;
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          /*...*/;

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
                       /*...*/;
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
  /*...*/
  const bool HasDynamicDenormals =
      /*...*/;
  /*...*/
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
          /*...*/;
    }
    /*...*/
  }

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
          /*...*/;
    }
    /*...*/
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  /*...*/;

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
      /*...*/;

  MI.eraseFromParent();
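// Illustrative sketch (not from the original source): the scalar shape of
// the FMA refinement above, with the div_scale/denorm-mode handling and the
// final div_fmas/div_fixup steps omitted. r approximates 1/d and is
// tightened once, then the quotient estimate q = n * r is tightened once
// more before the hardware fixup.
#include <cmath>

static float fdiv32_refine_sketch(float N, float D, float RcpEstimate) {
  float R = RcpEstimate;
  float E0 = std::fma(-D, R, 1.0f); // Fma0: 1 - d*r
  R = std::fma(E0, R, R);           // Fma1: r + e0*r
  float Q = N * R;                  // Mul: first quotient estimate
  float E1 = std::fma(-D, Q, N);    // Fma2: n - d*q
  Q = std::fma(E1, R, Q);           // Fma3: q + e1*r
  return Q;                         // Fma4 feeds div_fmas/div_fixup above
}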
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       /*...*/;

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
                 /*...*/;

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       /*...*/;

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
  /*...*/
  if (/*...*/) {
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    /*...*/ Scale1Unmerge.getReg(1));
    /*...*/ Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  /*...*/;

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
      /*...*/;

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Res0);
  /*...*/
  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
                  /*...*/;
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
                 /*...*/;
  /*...*/
  auto Fabs = B.buildFAbs(Ty, Val);
  /*...*/
  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  /*...*/
  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  auto Abs = B.buildFAbs(S32, RHS, Flags);
  /*...*/
  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);
  /*...*/
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
                 /*...*/;

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  /*...*/
  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
                  /*...*/;
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();
  /*...*/
    MI.eraseFromParent();
  /*...*/
  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  /*...*/
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
  /*...*/
        .addUse(SqrtX.getReg(0))
        /*...*/;

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    /*...*/
    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
    /*...*/
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  /*...*/
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  /*...*/
  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
  /*...*/
  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
  /*...*/
  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
  /*...*/
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
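// Illustrative sketch (not from the original source): the scalar form of
// the rsq-based refinement above, ignoring the input scaling and the
// zero/inf special cases. h tracks 0.5/sqrt(x) and s tracks sqrt(x); one
// error term and one residual tighten s exactly as the FMA chain does.
#include <cmath>

static float sqrt_refine_sketch(float X, float RsqEstimate) {
  float H = 0.5f * RsqEstimate;    // SqrtH
  float S = X * RsqEstimate;       // SqrtS
  float E = std::fma(-H, S, 0.5f); // SqrtE = 0.5 - h*s
  H = std::fma(H, E, H);           // refine h
  S = std::fma(S, E, S);           // refine s
  float D = std::fma(-S, S, X);    // SqrtD = x - s*s
  return std::fma(D, H, S);        // final correction
}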
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
  /*...*/
  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
  /*...*/
  auto ZeroInt = B.buildConstant(S32, 0);
  /*...*/
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
  /*...*/
  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
  /*...*/
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
  /*...*/
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  auto Flags = MI.getFlags();
  /*...*/
  LLT Ty = MRI.getType(Dst);
  /*...*/
  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 /*...*/;
  /*...*/
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
  /*...*/
  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  /*...*/
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      /*...*/
      return LaneOp.addUse(Src1)
          /*...*/;
    }
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
          /*...*/;
    /*...*/
    }
  /*...*/
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  LLT Ty = MRI.getType(DstReg);
  /*...*/
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      /*...*/)
    /*...*/;

  if (Size == SplitSize) {
    /*...*/
  }
  /*...*/
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      /*...*/;

    if (IID == Intrinsic::amdgcn_writelane)
      /*...*/;

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
  /*...*/
  if (Size % SplitSize != 0)
    /*...*/;
  /*...*/
  if (EltSize == SplitSize) {
    PartialResTy = EltTy;
  } else if (EltSize == 16 || EltSize == 32) {
    unsigned NElem = SplitSize / EltSize;
    /*...*/
  }
  /*...*/
  unsigned NumParts = Size / SplitSize;
  /*...*/
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(DstReg);
  /*...*/
  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  /*...*/
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  Register Pointer = MI.getOperand(2).getReg();
  /*...*/
  Register NumRecords = MI.getOperand(4).getReg();
  /*...*/
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);
  /*...*/
  std::optional<ValueAndVReg> StrideConst =
      /*...*/;
  if (!StrideConst || !StrideConst->Value.isZero()) {
    if (StrideConst) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  /*...*/
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  MI.eraseFromParent();
  /*...*/
  std::optional<uint32_t> KnownSize =
      /*...*/;
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  /*...*/
  MI.eraseFromParent();
                                         unsigned AddrSpace) const {
  /*...*/
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  /*...*/
  MI.eraseFromParent();
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  /*...*/
  std::tie(BaseReg, ImmOffset) =
      /*...*/;
  /*...*/
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
  /*...*/
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    /*...*/
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
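// Illustrative sketch (not from the original source): the arithmetic of the
// split above, assuming MaxImm is a contiguous low-bit mask such as 0xfff.
// The low part stays in the instruction immediate and anything above it is
// returned separately so the caller can fold it into the base register.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitOffsetSketch(uint32_t Offset,
                                                       uint32_t MaxImm) {
  uint32_t Overflow = Offset & ~MaxImm; // must be added to the base register
  uint32_t Imm = Offset - Overflow;     // fits in the immediate field
  return {Overflow, Imm};
}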
                                      bool ImageStore) const {
  /*...*/
  LLT StoreVT = MRI.getType(Reg);
  /*...*/
    auto Unmerge = B.buildUnmerge(S16, Reg);
    /*...*/
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
  /*...*/
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      /*...*/
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
  /*...*/
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        /*...*/;
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
  /*...*/
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        /*...*/;
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
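// Illustrative sketch (not from the original source): the basic D16 packing
// the helper above produces on subtargets with packed f16 support, assuming
// the usual layout with the first element in the low 16 bits of each dword.
#include <cstdint>

static uint32_t packD16PairSketch(uint16_t Elt0, uint16_t Elt1) {
  return (uint32_t)Elt0 | ((uint32_t)Elt1 << 16);
}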
                                             bool IsFormat) const {
  /*...*/
  LLT Ty = MRI->getType(VData);
  /*...*/
    VData = B.buildBitcast(Ty, VData).getReg(0);
                                        bool IsFormat) const {
  /*...*/
  LLT Ty = MRI.getType(VData);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  /*...*/
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
  /*...*/
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    /*...*/
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
  /*...*/
    Format = MI.getOperand(5 + OpOffset).getImm();
  /*...*/
  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  /*...*/
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    /*...*/
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
    /*...*/
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
    /*...*/
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
    /*...*/
  }

  auto MIB = B.buildInstr(Opc)
      /*...*/;

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                            unsigned ImmOffset, unsigned Format,
                            /*...*/) {
  auto MIB = B.buildInstr(Opc)
      /*...*/;

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
                                       bool IsTyped) const {
  /*...*/
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    /*...*/
  }
  /*...*/
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();
  /*...*/
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
  /*...*/
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    /*...*/
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
  /*...*/
    Format = MI.getOperand(5 + OpOffset).getImm();
  /*...*/
  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  /*...*/
  LLT Ty = MRI.getType(Dst);
  /*...*/
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  /*...*/
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  /*...*/
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  /*...*/
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    /*...*/
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    /*...*/
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    /*...*/
  } else {
    /*...*/
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
    /*...*/
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
    /*...*/
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
    /*...*/
  }

  if (IsTFE) {
    unsigned NumLoadDWords = NumValueDWords + 1;
    /*...*/
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    /*...*/
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      /*...*/
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      /*...*/
      B.buildUnmerge(LoadElts, LoadDstReg);
      /*...*/
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if (/*...*/) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    /*...*/
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    /*...*/
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    /*...*/
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(/*...*/
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
  /*...*/
  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    /*...*/
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
  /*...*/
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    /*...*/
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
  /*...*/
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                                      bool IsA16, bool IsG16) {
  /*...*/
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    /*...*/
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        /*...*/
        B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
            /*...*/;
      /*...*/
             "Bias needs to be converted to 16 bit in A16 mode");
      /*...*/
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      /*...*/
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          /*...*/
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        /*...*/
        B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
            /*...*/;
        /*...*/
            V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
            /*...*/;
      }
    }
  }
                                      int DimIdx, int NumVAddrs) {
  /*...*/
  for (int I = 0; I != NumVAddrs; ++I) {
    /*...*/
    if (SrcOp.isReg()) {
      /*...*/
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    /*...*/
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }
  /*...*/
  for (int I = 1; I != NumVAddrs; ++I) {
    /*...*/
    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  /*...*/
  VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  Ty = MRI->getType(VData);
  /*...*/
  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
  /*...*/
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  /*...*/
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  /*...*/
  const bool IsA16 = AddrTy == S16;
  /*...*/
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    /*...*/
    } else if (DMask != 0) {
      /*...*/
    } else if (!IsTFE && !BaseOpcode->Store) {
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      /*...*/
    }
  }
  /*...*/
  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  /*...*/
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));
  /*...*/
  if (IsTFE && DMask == 0) {
    /*...*/
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    /*...*/
    LLT Ty = MRI->getType(VData0);
    /*...*/
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      /*...*/;
    /*...*/
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    /*...*/
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
  /*...*/
  if (IsA16 && !ST.hasA16()) {
    /*...*/
  }

  if (IsA16 || IsG16) {
    /*...*/
        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      /*...*/
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      /*...*/
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
      /*...*/
    }

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      /*...*/
      if (!SrcOp.isReg()) {
        /*...*/
      }
      /*...*/
      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    /*...*/
        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      /*...*/
          ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
          Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      /*...*/
    }
  }
  if (RepackedReg != VData) {
    MI.getOperand(1).setReg(RepackedReg);
    /*...*/
  }
  /*...*/
  if (NumElts < DMaskLanes)
    /*...*/;

  if (NumElts > 4 || DMaskLanes > 4)
    /*...*/;
  /*...*/
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      /*...*/;
  /*...*/
  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;
  /*...*/
  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  /*...*/
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    /*...*/;
  /*...*/
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
  /*...*/
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
  /*...*/
  MI.getOperand(0).setReg(NewResultReg);
  /*...*/
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      /*...*/;
    /*...*/
    MI.removeOperand(1);
    /*...*/
    B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
  /*...*/
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    /*...*/
    ResultRegs[0] = NewResultReg;
  } else {
    /*...*/
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);
    /*...*/
    ResultRegs.resize(NumDataRegs);
  }
  /*...*/
    B.buildTrunc(DstReg, ResultRegs[0]);
  /*...*/
    B.buildBitcast(DstReg, ResultRegs[0]);
  /*...*/
      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
  /*...*/
      Reg = B.buildTrunc(S16, Reg).getReg(0);
  /*...*/
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    /*...*/
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      /*...*/;
  };
  /*...*/
  LLT ResTy = MRI->getType(ResultRegs[0]);
  /*...*/
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
  /*...*/
  if (ResultRegs.size() == 1) {
    NewResultReg = ResultRegs[0];
  } else if (ResultRegs.size() == 2) {
    /*...*/
    NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
    /*...*/
  }

  if (MRI->getType(DstReg).getNumElements() <
      MRI->getType(NewResultReg).getNumElements()) {
    B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
  } else {
    B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
  }
  /*...*/
  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();
  /*...*/
  LLT Ty = B.getMRI()->getType(OrigDst);
  /*...*/
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    /*...*/
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  /*...*/
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
  /*...*/
  B.setInsertPt(B.getMBB(), MI);
  /*...*/
  B.setInsertPt(B.getMBB(), MI);
  /*...*/
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);
  /*...*/
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      /*...*/);
  /*...*/
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
  MI.eraseFromParent();
  /*...*/
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      /*...*/;
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
      /*...*/;
  /*...*/
  MI.eraseFromParent();
  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  /*...*/
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        /*...*/);
    /*...*/
    Register LoadAddr = MRI.createGenericVirtualRegister(
        /*...*/);
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  /*...*/);
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        /*...*/;
    MI.eraseFromParent();
  /*...*/
  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      /*...*/;
  MI.eraseFromParent();
  MI.eraseFromParent();
  /*...*/
  B.buildInstr(AMDGPU::S_TRAP)
      /*...*/;
  MI.eraseFromParent();
  /*...*/
                                     "debugtrap handler not supported",
                                     /*...*/);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    /*...*/
    B.buildInstr(AMDGPU::S_TRAP)
        /*...*/;
  MI.eraseFromParent();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  /*...*/
  Register RayInvDir = MI.getOperand(6).getReg();
  /*...*/
        "intrinsic not supported on subtarget",
        /*...*/);
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
  /*...*/
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  /*...*/
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  /*...*/
        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);
  /*...*/
        IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);
  /*...*/
  if (UseNSA && IsGFX11Plus) {
    /*...*/
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
    /*...*/
    packLanes(RayOrigin);
    /*...*/
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          /*...*/
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                 UnmergeRayDir.getReg(0)}))
              /*...*/
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                 UnmergeRayDir.getReg(1)}))
              /*...*/
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                 UnmergeRayDir.getReg(2)}))
              /*...*/;
    /*...*/
      packLanes(RayInvDir);
  } else {
    /*...*/
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      /*...*/
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      /*...*/
    packLanes(RayOrigin);
    /*...*/
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      /*...*/
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      /*...*/
      packLanes(RayInvDir);
  }
  /*...*/
  Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
  /*...*/
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
      /*...*/
      .addImm(IsA16 ? 1 : 0)
      /*...*/;

  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  /*...*/
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
  if (MRI.getType(Src) != S64)
    return false;
  /*...*/
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*...*/);
  /*...*/
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*...*/);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  /*...*/
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  /*...*/
      .addReg(Unmerge.getReg(0));
  /*...*/
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    /*...*/
    bool Negated = false;
    /*...*/
    if (Negated)
      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    if (IntrID == Intrinsic::amdgcn_if) {
      B.buildInstr(AMDGPU::SI_IF)
          /*...*/
          .addMBB(UncondBrTarget);
    } else {
      B.buildInstr(AMDGPU::SI_ELSE)
          /*...*/
          .addMBB(UncondBrTarget);
    }
    /*...*/
    B.buildBr(*CondBrTarget);
    /*...*/
    MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
    MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
    MI.eraseFromParent();
    BrCond->eraseFromParent();
    /*...*/
  }
  case Intrinsic::amdgcn_loop: {
    /*...*/
    bool Negated = false;
    /*...*/
    if (Negated)
      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    B.buildInstr(AMDGPU::SI_LOOP)
        /*...*/
        .addMBB(UncondBrTarget);
    /*...*/
    B.buildBr(*CondBrTarget);
    /*...*/
    MI.eraseFromParent();
    BrCond->eraseFromParent();
    MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
    /*...*/
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull: /*...*/
  case Intrinsic::amdgcn_make_buffer_rsrc: /*...*/
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    /*...*/
    B.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    /*...*/
  case Intrinsic::amdgcn_implicitarg_ptr: /*...*/
  case Intrinsic::amdgcn_workitem_id_x: /*...*/
  case Intrinsic::amdgcn_workitem_id_y: /*...*/
  case Intrinsic::amdgcn_workitem_id_z: /*...*/
  case Intrinsic::amdgcn_workgroup_id_x: /*...*/
  case Intrinsic::amdgcn_workgroup_id_y: /*...*/
  case Intrinsic::amdgcn_workgroup_id_z: /*...*/
  case Intrinsic::amdgcn_wave_id: /*...*/
  case Intrinsic::amdgcn_lds_kernel_id: /*...*/
  case Intrinsic::amdgcn_dispatch_ptr: /*...*/
  case Intrinsic::amdgcn_queue_ptr: /*...*/
  case Intrinsic::amdgcn_implicit_buffer_ptr: /*...*/
  case Intrinsic::amdgcn_dispatch_id: /*...*/
  case Intrinsic::r600_read_ngroups_x: /*...*/
  case Intrinsic::r600_read_ngroups_y: /*...*/
  case Intrinsic::r600_read_ngroups_z: /*...*/
  case Intrinsic::r600_read_local_size_x: /*...*/
  case Intrinsic::r600_read_local_size_y: /*...*/
  case Intrinsic::r600_read_local_size_z: /*...*/
  case Intrinsic::r600_read_global_size_x: /*...*/
  case Intrinsic::r600_read_global_size_y: /*...*/
  case Intrinsic::r600_read_global_size_z: /*...*/
  case Intrinsic::amdgcn_fdiv_fast: /*...*/
  case Intrinsic::amdgcn_is_shared: /*...*/
  case Intrinsic::amdgcn_is_private: /*...*/
  case Intrinsic::amdgcn_wavefrontsize: {
    /*...*/
    MI.eraseFromParent();
    /*...*/
  }
  case Intrinsic::amdgcn_s_buffer_load: /*...*/
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store: /*...*/
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: /*...*/
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: /*...*/
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: /*...*/
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format: /*...*/
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: /*...*/
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: /*...*/
  case Intrinsic::amdgcn_rsq_clamp: /*...*/
  case Intrinsic::amdgcn_image_bvh_intersect_ray: /*...*/
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    /*...*/
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    /*...*/
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    /*...*/
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    /*...*/
  }
  case Intrinsic::amdgcn_fmed3: {
    /*...*/
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    /*...*/
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp: /*...*/
  case Intrinsic::amdgcn_s_buffer_prefetch_data: /*...*/
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
bool hasCvtPkF16F32Inst() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
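A minimal sketch, not code from this file, showing how the APFloat factories listed above are typically used; the function and variable names are hypothetical.
// Illustrative only: building common f32 constants with the APFloat factories.
#include "llvm/ADT/APFloat.h"
using namespace llvm;

static void buildF32Limits() {
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat PosInf = APFloat::getInf(Sem);                  // +infinity
  APFloat MaxF32 = APFloat::getLargest(Sem);              // largest finite f32
  APFloat MinNorm = APFloat::getSmallestNormalized(Sem);  // smallest normal f32
  APFloat NegMax = APFloat::getLargest(Sem, /*Negative=*/true);
  (void)PosInf; (void)MaxF32; (void)MinNorm; (void)NegMax;
}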
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
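A minimal sketch, not code from this file, showing the compare predicates above being emitted through MachineIRBuilder; the register and function names are hypothetical and the operands are assumed to already carry suitable 32-bit types.
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static void emitCompares(MachineIRBuilder &B, Register IntA, Register IntB,
                         Register FpA, Register FpB) {
  const LLT S1 = LLT::scalar(1);
  // Signed integer "less than".
  auto IsSlt = B.buildICmp(CmpInst::ICMP_SLT, S1, IntA, IntB);
  // Ordered floating-point "greater than": false if either input is a NaN.
  auto IsOgt = B.buildFCmp(CmpInst::FCMP_OGT, S1, FpA, FpB);
  (void)IsSlt; (void)IsOgt;
}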
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasDPALU_DPP() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
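A minimal sketch, not code from this file, of constructing and transforming LLTs with the helpers listed above; the variable names are hypothetical, and the LLT header path differs between LLVM releases, so it is omitted here.
static void lltExamples() {
  const LLT S32 = LLT::scalar(32);                 // plain 32-bit scalar
  const LLT V4S16 = LLT::fixed_vector(4, 16);      // <4 x s16>
  const LLT P1 = LLT::pointer(1, 64);              // 64-bit pointer in AS 1

  unsigned EltBits = V4S16.getScalarSizeInBits();  // 16
  LLT V4S32 = V4S16.changeElementType(S32);        // <4 x s32>
  LLT V2S16 = V4S16.changeElementCount(ElementCount::getFixed(2));
  LLT Elt = V4S16.getElementType();                // s16
  (void)P1; (void)EltBits; (void)V4S32; (void)V2S16; (void)Elt;
}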
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
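A minimal sketch, not the rules this file actually installs, of a typical chain built with the LegalizeRuleSet methods above, as it would appear inside a LegalizerInfo constructor; the opcode (G_ADD) and the clamp choices are hypothetical.
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT V2S16 = LLT::fixed_vector(2, 16);

getActionDefinitionsBuilder(TargetOpcode::G_ADD)
    .legalFor({S32, S16, V2S16})          // types selectable as-is
    .clampMaxNumElementsStrict(0, S16, 2) // keep at most <2 x s16>
    .scalarize(0)                         // break up other vectors
    .widenScalarToNextPow2(0, 16)         // avoid odd scalar widths
    .clampScalar(0, S16, S32);            // final scalar size range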
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
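A minimal sketch, not code from this file, of building generic MIR with MachineIRBuilder; the function name, registers, and constant value are hypothetical.
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;

static void buildScaledAdd(MachineIRBuilder &B, Register Dst, Register Src) {
  const LLT S32 = LLT::scalar(32);
  auto Four = B.buildConstant(S32, 4);    // G_CONSTANT i32 4
  auto Shl = B.buildShl(S32, Src, Four);  // G_SHL %Src, 4
  B.buildAdd(Dst, Shl, Four);             // G_ADD into Dst
}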
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
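A minimal sketch, not code from this file, of inspecting a memory operand with the accessors above; the 32-bit size and 4-byte alignment thresholds are arbitrary example values.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

static bool isAligned32BitAccess(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;
  const MachineMemOperand *MMO = *MI.memoperands_begin();
  return MMO->getMemoryType().getSizeInBits() == 32 &&
         MMO->getAlign().value() >= 4;
}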
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
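A minimal sketch, not code from this file, of forming pointer LLTs for the AMDGPU address spaces above. In the real legalizer the pointer widths come from the DataLayout; the literal sizes below are the usual values and are written out only for illustration.
const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
const LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);
const LLT ConstPtr   = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);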
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
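A minimal sketch, not taken from this file, of composing the predicates and mutations above inside a rule set; the opcode (G_FADD) and type choices are hypothetical, and the usual using-directives for the LegalityPredicates and LegalizeMutations namespaces are assumed to be in scope.
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);

getActionDefinitionsBuilder(TargetOpcode::G_FADD)
    .legalFor({S32, S16})
    // Custom-lower 64-bit scalars only.
    .customIf(all(isScalar(0), sizeIs(0, 64)))
    // Break any remaining vector into its elements.
    .fewerElementsIf(isVector(0), scalarize(0))
    // Then ensure the scalar is at least 16 bits wide.
    .minScalar(0, S16);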
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ Mul
Product of integers.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
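A minimal sketch, not code from this file, of reading a constant operand with the two helpers above; the wrapper function name is hypothetical.
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <optional>
using namespace llvm;

static std::optional<int64_t> getImmOrNone(Register Reg,
                                           const MachineRegisterInfo &MRI) {
  // Direct G_CONSTANT definition.
  if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(Reg, MRI))
    return Imm;
  // Otherwise look through copies/extensions to a G_CONSTANT.
  if (auto ValAndReg = getIConstantVRegValWithLookThrough(Reg, MRI))
    return ValAndReg->Value.getSExtValue();
  return std::nullopt;
}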
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
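A minimal sketch, not code from this file, applying the bit-math helpers listed above to a hypothetical element-size computation; the function names and the 32-bit chunk size are illustrative.
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static uint64_t roundUpEltSize(unsigned SizeInBits) {
  if (isPowerOf2_32(SizeInBits))     // already a power of two
    return SizeInBits;
  return PowerOf2Ceil(SizeInBits);   // e.g. 96 -> 128
}

static unsigned piecesOf32(unsigned SizeInBits) {
  return divideCeil(SizeInBits, 32); // e.g. 96 -> 3 dwords
}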
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static const fltSemantics & IEEEsingle() LLVM_READNONE
static const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
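A minimal sketch, not code from this file, of querying the f32 denormal mode with the types listed above, for example to decide whether a flush-aware expansion is needed; the function name is hypothetical.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static bool flushesF32Denormals(const MachineFunction &MF) {
  DenormalMode Mode = MF.getDenormalMode(APFloat::IEEEsingle());
  // Anything other than full IEEE handling means input or output denormals
  // may be flushed (PreserveSign, PositiveZero, or Dynamic).
  return Mode != DenormalMode::getIEEE();
}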