34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
37#define DEBUG_TYPE "amdgpu-legalinfo"
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
47 "amdgpu-global-isel-new-legality",
48 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
  const LLT Ty = Query.Types[TypeIdx];
  EltSize > 1 && EltSize < 32 &&
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,
  const LLT Ty = Query.Types[TypeIdx];
  unsigned Pieces = (Size + 63) / 64;
  const LLT Ty = Query.Types[TypeIdx];
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  const LLT Ty = Query.Types[TypeIdx];
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  return EltSize == 16 || EltSize % 32 == 0;
  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;
  LLT Ty = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
                     AtomicOrdering::NotAtomic))
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                                 Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;
  if (Size != MemSizeInBits)
                                    uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
      Query.Types[1].getAddressSpace(), Opcode);
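// Helpers below rewrite 128-bit buffer-resource pointer operands: the pointer
// is exposed either as a <4 x s32> vector or as a plain scalar integer so the
// generic legalizer can operate on it, and is converted back afterwards.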
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
  std::array<Register, 4> VectorElems;
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  for (unsigned I = 0; I < NumParts; ++I)
        B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
  B.buildMergeValues(MO, VectorElems);
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
  for (unsigned I = 0; I < NumParts; ++I)
  return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
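// AMDGPULegalizerInfo constructor: defines one LLT per target address space
// and groups them (AddrSpaces64, AddrSpaces32, FP type sets, ...) so the
// legalization rules below can be written per pointer width and FP type.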
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
  const LLT BufferStridedPtr =
  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .minScalarOrElt(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32);
      .minScalarOrElt(0, S32)
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .clampMaxNumElements(0, S8, 2)
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
      .legalFor(FPTypesPK16)
      .clampScalar(0, S16, S64);
      .clampScalar(0, S32, S64);
      .clampScalar(0, S32, S64);
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .clampScalar(1, S32, S32)
    FMad.customFor({S32, S16});
    FMad.customFor({S32});
    FMad.customFor({S16});
  FRem.minScalar(0, S32)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)
  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
1225 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1226 if (
ST.has16BitInsts())
1227 ExpOps.customFor({{
S32}, {
S16}});
1229 ExpOps.customFor({
S32});
1230 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1233 getActionDefinitionsBuilder(G_FPOWI)
1234 .clampScalar(0, MinScalarFPTy,
S32)
1237 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1238 Log2Ops.customFor({
S32});
1239 if (
ST.has16BitInsts())
1240 Log2Ops.legalFor({
S16});
1242 Log2Ops.customFor({
S16});
1243 Log2Ops.scalarize(0)
1247 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1248 LogOps.customFor({
S32,
S16});
1249 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1253 getActionDefinitionsBuilder(G_CTPOP)
1255 .clampScalar(0,
S32,
S32)
1256 .widenScalarToNextPow2(1, 32)
1257 .clampScalar(1,
S32,
S64)
1259 .widenScalarToNextPow2(0, 32);
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
          return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
1608 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1609 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1610 {
S32, GlobalPtr,
S16, 2 * 8},
1611 {
S32, LocalPtr,
S8, 8},
1612 {
S32, LocalPtr,
S16, 16},
1613 {
S32, PrivatePtr,
S8, 8},
1614 {
S32, PrivatePtr,
S16, 16},
1615 {
S32, ConstantPtr,
S8, 8},
1616 {
S32, ConstantPtr,
S16, 2 * 8}})
1622 if (
ST.hasFlatAddressSpace()) {
1623 ExtLoads.legalForTypesWithMemDesc(
1624 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1632 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1634 ExtLoads.clampScalar(0,
S32,
S32)
1635 .widenScalarToNextPow2(0)
1638 auto &Atomics = getActionDefinitionsBuilder(
1639 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1640 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1641 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1642 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1643 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1644 {
S64, GlobalPtr}, {
S64, LocalPtr},
1645 {
S32, RegionPtr}, {
S64, RegionPtr}});
1646 if (
ST.hasFlatAddressSpace()) {
1647 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1651 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1652 if (
ST.hasLDSFPAtomicAddF32()) {
1653 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1654 if (
ST.hasLdsAtomicAddF64())
1655 Atomic.legalFor({{
S64, LocalPtr}});
1656 if (
ST.hasAtomicDsPkAdd16Insts())
1657 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1659 if (
ST.hasAtomicFaddInsts())
1660 Atomic.legalFor({{
S32, GlobalPtr}});
1661 if (
ST.hasFlatAtomicFaddF32Inst())
1662 Atomic.legalFor({{
S32, FlatPtr}});
1664 if (
ST.hasGFX90AInsts()) {
1675 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1676 ST.hasAtomicBufferGlobalPkAddF16Insts())
1677 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1678 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1679 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1680 if (
ST.hasAtomicFlatPkAdd16Insts())
1681 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1686 auto &AtomicFMinFMax =
1687 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1688 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1690 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1691 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1692 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1693 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1694 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1695 AtomicFMinFMax.legalFor({
F32, FlatPtr});
1696 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1697 AtomicFMinFMax.legalFor({
F64, FlatPtr});
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)
1728 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1730 if (
ST.has16BitInsts()) {
1731 if (
ST.hasVOP3PInsts()) {
1733 .clampMaxNumElements(0,
S16, 2);
1735 Shifts.legalFor({{
S16,
S16}});
1738 Shifts.widenScalarIf(
1743 const LLT AmountTy = Query.
Types[1];
1748 Shifts.clampScalar(1,
S32,
S32);
1749 Shifts.widenScalarToNextPow2(0, 16);
1750 Shifts.clampScalar(0,
S16,
S64);
1752 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1760 Shifts.clampScalar(1,
S32,
S32);
1761 Shifts.widenScalarToNextPow2(0, 32);
1762 Shifts.clampScalar(0,
S32,
S64);
1764 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1769 Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32);
1874 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1884 if (
ST.hasScalarPackInsts()) {
1887 .minScalarOrElt(0,
S16)
1890 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1894 BuildVector.customFor({
V2S16,
S16});
1895 BuildVector.minScalarOrElt(0,
S32);
1897 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1905 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1907 .clampMaxNumElements(0,
S32, 32)
1908 .clampMaxNumElements(1,
S16, 2)
1909 .clampMaxNumElements(0,
S16, 64);
1911 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
        .widenScalarToNextPow2(LitTyIdx, 16)
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1993 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1994 .legalFor({{
S32}, {
S64}});
1996 if (
ST.hasVOP3PInsts()) {
1997 SextInReg.lowerFor({{
V2S16}})
2001 .clampMaxNumElementsStrict(0,
S16, 2);
2002 }
else if (
ST.has16BitInsts()) {
2003 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2007 SextInReg.lowerFor({{
S32}, {
S64}});
2012 .clampScalar(0,
S32,
S64)
  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
      G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
      G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
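// legalizeCustom: dispatches every operation marked .custom()/.customFor()
// above to its dedicated lowering routine.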
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
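// getSegmentAperture: returns the high 32 bits of the flat aperture for the
// local/private segment, either directly from the aperture registers
// (SRC_SHARED_BASE / SRC_PRIVATE_BASE) or by loading it from the implicit
// kernarg segment / queue pointer.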
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
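// legalizeAddrSpaceCast: expands G_ADDRSPACE_CAST (and
// amdgcn.addrspacecast.nonnull) using null checks plus the segment aperture,
// or a simple extract/merge when one side is a 32-bit pointer.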
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();

    auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
      Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
      return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

      castLocalOrPrivateToFlat(Dst);
      MI.eraseFromParent();

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                             SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);
  MI.eraseFromParent();
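// FP rounding lowerings: round-to-even is built from a copysign-biased
// add/sub pair, and f64 ceil from trunc plus a conditional +1.0 when the
// source is positive and not already integral.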
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
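// legalizeITOFP: converts a 64-bit integer to FP by splitting it into 32-bit
// halves (or normalizing with ffbh/ctlz for an f32 result) and recombining
// the converted pieces with ldexp.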
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
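// legalizeFPTOI: converts f32/f64 to a 64-bit integer by scaling with 2^-32,
// flooring to obtain the high word, recovering the low word with an FMA, and
// applying a final sign fixup for the signed case.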
  const LLT SrcLT = MRI.getType(Src);
  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
                           : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);
    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
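// legalizeSinCos: scales the input by 1/(2*pi) (optionally through
// amdgcn.fract) and then emits amdgcn.sin or amdgcn.cos.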
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();

                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);
  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);

        GV->getName() != "llvm.amdgcn.module.lds") {
          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(),
      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    MI.eraseFromParent();
    MI.eraseFromParent();
    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
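// legalizeLoad: rewrites CONSTANT_ADDRESS_32BIT pointers via an address-space
// cast and widens sub-dword or oddly sized loads to a wider legal load,
// truncating or extracting the requested part afterwards.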
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();
  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
  case TargetOpcode::G_FPEXT: {
3279std::pair<Register, Register>
3281 unsigned Flags)
const {
3286 auto SmallestNormal =
B.buildFConstant(
3288 auto IsLtSmallestNormal =
3291 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3292 auto One =
B.buildFConstant(
F32, 1.0);
3294 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3295 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3297 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3310 LLT Ty =
B.getMRI()->getType(Dst);
3311 unsigned Flags =
MI.getFlags();
3316 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3317 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3318 .addUse(Ext.getReg(0))
3320 B.buildFPTrunc(Dst,
Log2, Flags);
3321 MI.eraseFromParent();
3329 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3332 MI.eraseFromParent();
3336 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3337 .addUse(ScaledInput)
3340 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3341 auto Zero =
B.buildFConstant(Ty, 0.0);
3343 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3344 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3346 MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
      auto PromoteSrc = B.buildFPExt(F32, X);
      B.buildFPTrunc(Dst, LogVal);
    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);

  MI.eraseFromParent();
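// legalizeFlogUnsafe / legalizeFExp2: the fast log paths multiply the
// hardware log2 result by the inverted log2 of the base (compensating for
// input scaling), and the f32 exp2 path pre-scales inputs that would
// otherwise underflow before calling amdgcn.exp2.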
                                                   unsigned Flags) const {
  const double Log2BaseInverted =
  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                               RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
          .addUse(Mul.getReg(0))
      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
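// buildMultiply: expands a wide G_MUL over 32-bit limbs, accumulating partial
// products with mul/add chains or G_AMDGPU_MAD_U64_U32 and tracking carries
// between even- and odd-aligned product columns.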
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

        if (CarryIn.empty())

        bool HaveCarryOut = true;
        if (CarryIn.size() == 1) {
          LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          CarryAccum = getZero32();
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

            LocalAccum = getZero32();
            HaveCarryOut = false;

            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);
  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
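// CTLZ/CTTZ lowering: maps the generic count-leading/trailing-zero operations
// onto the target's FFBH_U32 / FFBL_B32 instructions, pre-shifting narrow
// sources so the 32-bit result is correct.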
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    if (!MRI.hasOneNonDBGUse(NegatedCond))
    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

                                  *ArgRC, B.getDebugLoc(), ArgTy);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);
4330 Arg = &WorkGroupIDX;
4331 ArgRC = &AMDGPU::SReg_32RegClass;
4335 Arg = &WorkGroupIDY;
4336 ArgRC = &AMDGPU::SReg_32RegClass;
4340 Arg = &WorkGroupIDZ;
4341 ArgRC = &AMDGPU::SReg_32RegClass;
4356 B.buildConstant(DstReg, 0);
4362 B.buildUndef(DstReg);
4366 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4377 MI.eraseFromParent();
4383 B.buildConstant(
MI.getOperand(0).getReg(),
C);
4384 MI.eraseFromParent();
4405 B.buildUndef(DstReg);
4406 MI.eraseFromParent();
4410 if (Arg->isMasked()) {
4424 MI.eraseFromParent();
4431 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
4441 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4449 Align Alignment)
const {
4453 "unexpected kernarg parameter type");
4457 B.buildLoad(DstReg,
Ptr, PtrInfo,
Align(4),
4460 MI.eraseFromParent();
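// legalizeUnsignedDIV_REM32Impl: 32-bit unsigned division via a
// floating-point reciprocal estimate (RCP_IFLAG) refined with a
// Newton-Raphson style correction, followed by conditional quotient and
// remainder fixups.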
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);
  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
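// emitReciprocalU64 and the 64-bit unsigned division expansion: the 64-bit
// reciprocal is estimated in floating point, refined with two rounds of
// multiply/add carry chains, and the quotient is corrected by comparing the
// remainder against the denominator.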
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);
  auto C2 = B.buildSExt(S32, CmpLo);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
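// Illustrative sketch only: the signed path above folds the signs out with
// shift/add/xor, runs the unsigned expansion, then re-applies the signs with
// xor/sub. Scalar 32-bit model (helper name is illustrative; assumes
// arithmetic right shift for negative values):
#include <cstdint>
static void sdiv32ViaUdivSketch(int32_t LHS, int32_t RHS,
                                int32_t &Quot, int32_t &Rem) {
  uint32_t LSign = (uint32_t)(LHS >> 31);          // 0 or 0xffffffff
  uint32_t RSign = (uint32_t)(RHS >> 31);
  uint32_t UL = ((uint32_t)LHS + LSign) ^ LSign;   // |LHS| (mod 2^32)
  uint32_t UR = ((uint32_t)RHS + RSign) ^ RSign;   // |RHS|
  uint32_t UQ = UL / UR;                           // the unsigned expansion
  uint32_t URm = UL % UR;
  uint32_t QSign = LSign ^ RSign;                  // quotient sign
  Quot = (int32_t)((UQ ^ QSign) - QSign);
  Rem = (int32_t)((URm ^ LSign) - LSign);          // remainder takes LHS's sign
}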
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

      MI.eraseFromParent();

    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))

      MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
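// Illustrative sketch only: the fast f64 path above refines a hardware
// reciprocal estimate with two fma-based Newton-Raphson steps and then
// corrects the product with one more fma. Scalar model using std::fma:
#include <cmath>
static double fastFdivSketch(double X, double Y, double RcpEstimate) {
  double R = RcpEstimate;               // ~1/Y (stands in for the rcp intrinsic)
  double T0 = std::fma(-Y, R, 1.0);     // e = 1 - Y*R
  R = std::fma(T0, R, R);               // R += R*e
  double T1 = std::fma(-Y, R, 1.0);
  R = std::fma(T1, R, R);               // second refinement
  double Ret = X * R;                   // initial quotient
  double T2 = std::fma(-Y, Ret, X);     // residual X - Y*Ret
  return std::fma(T2, R, Ret);          // corrected quotient
}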
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
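// Illustrative sketch only: the immediate built above packs the two 2-bit
// denorm fields, single-precision mode in bits [1:0] and double/half mode in
// bits [3:2], which is the layout the S_DENORM_MODE path relies on:
static unsigned packDenormModeSketch(unsigned SPMode2b, unsigned DPMode2b) {
  return (SPMode2b & 0x3u) | ((DPMode2b & 0x3u) << 2);
}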
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
        Scale1Unmerge.getReg(1));
        Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  auto Fabs = B.buildFAbs(Ty, Val);

  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
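// Illustrative sketch only: G_FFREXP is lowered to amdgcn_frexp_mant/exp plus
// selects so that non-finite inputs pass the value through and report a zero
// exponent, matching the behaviour modelled here with the standard library:
#include <cmath>
#include <utility>
static std::pair<double, int> ffrexpSketch(double Val) {
  if (!std::isfinite(Val))
    return {Val, 0};                    // inf/nan: mantissa = input, exp = 0
  int Exp = 0;
  double Mant = std::frexp(Val, &Exp);  // Mant in [0.5, 1), Val = Mant * 2^Exp
  return {Mant, Exp};
}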
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
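// Illustrative sketch only: amdgcn_fdiv_fast avoids overflowing the
// reciprocal by pre-scaling the denominator by 2^-32 when its magnitude
// exceeds 2^96, and multiplying the same factor back into the result:
#include <cmath>
static float fdivFastSketch(float LHS, float RHS) {
  float Sel = (std::fabs(RHS) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float Mul0 = RHS * Sel;      // scaled denominator
  float Rcp = 1.0f / Mul0;     // stands in for the rcp intrinsic
  float Mul1 = LHS * Rcp;
  return Sel * Mul1;           // undo the scaling
}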
  unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();

    MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

        .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
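// Illustrative sketch only: the f64 sqrt lowering above seeds with the
// hardware rsq estimate and runs a Goldschmidt-style refinement in which H
// converges to 0.5/sqrt(x) and S converges to sqrt(x). Scalar model, with a
// deliberately low-precision seed standing in for the rsq intrinsic; the
// ldexp-based rescaling for near-subnormal inputs is omitted here:
#include <cmath>
static double fsqrt64RefineSketch(double X) {
  double Y = 1.0 / std::sqrt((double)(float)X);  // crude rsq seed
  double H0 = Y * 0.5;
  double S0 = X * Y;
  double R0 = std::fma(-H0, S0, 0.5);
  double S1 = std::fma(S0, R0, S0);
  double H1 = std::fma(H0, R0, H0);
  double D0 = std::fma(-S1, S1, X);
  double S2 = std::fma(D0, H1, S1);
  double D1 = std::fma(-S2, S2, X);
  return std::fma(D1, H1, S2);
}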
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;

    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      return LaneOp.addUse(Src1)

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();

  LLT Ty = MRI.getType(DstReg);

    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();

    PartialResTy = EltTy;

  unsigned NumParts = Size / 32;

    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);
      Src1 = Src1Parts.getReg(i);
    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);
    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));

  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
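// Illustrative sketch only: values wider than 32 bits are handled above by
// unmerging into 32-bit pieces, applying the lane intrinsic to each piece,
// and re-merging the results. Scalar 64-bit model of that split/apply/merge
// shape, with the per-piece operation passed in as a callback:
#include <cstdint>
template <typename PieceOp> // callable as uint32_t(uint32_t)
static uint64_t perDwordLaneOpSketch(uint64_t Src, PieceOp Op) {
  uint32_t Lo = (uint32_t)Src;           // buildUnmerge piece 0
  uint32_t Hi = (uint32_t)(Src >> 32);   // buildUnmerge piece 1
  uint32_t NewLo = Op(Lo);               // createLaneOp on each piece
  uint32_t NewHi = Op(Hi);
  return (uint64_t)NewLo | ((uint64_t)NewHi << 32); // buildMergeLikeInstr
}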
  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  Register Pointer = MI.getOperand(2).getReg();
  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  std::optional<ValueAndVReg> StrideConst =
  if (!StrideConst || !StrideConst->Value.isZero()) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);

  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
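// Illustrative sketch only: building a buffer resource from a 64-bit pointer
// keeps the low 16 bits of the pointer's high dword and ORs the 16-bit stride
// into the upper half, as done above. Standalone model of that dword:
#include <cstdint>
static uint32_t packRsrcHighHalfSketch(uint32_t PtrHighDword,
                                       uint16_t Stride) {
  return (PtrHighDword & 0x0000ffffu) | ((uint32_t)Stride << 16);
}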
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();

                                            unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());

  MI.eraseFromParent();
std::pair<Register, unsigned>

  std::tie(BaseReg, ImmOffset) =

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
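// Illustrative sketch only: splitBufferOffsets separates a byte offset into
// the part that fits the instruction's immediate field and an overflow that
// must go into the register operand. Simplified scalar model (MaxImm stands
// in for the subtarget's maximum immediate, and the negative-overflow
// adjustment done above is omitted):
#include <cstdint>
#include <utility>
static std::pair<uint32_t, uint32_t>
splitOffsetSketch(uint32_t Offset, uint32_t MaxImm /* e.g. 4095 */) {
  uint32_t ImmOffset = Offset;
  uint32_t Overflow = ImmOffset & ~MaxImm; // bits that do not fit
  ImmOffset -= Overflow;                   // low bits kept as the immediate
  return {Overflow /* register part */, ImmOffset};
}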
                                            bool ImageStore) const {

  LLT StoreVT = MRI.getType(Reg);

    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
  LLT Ty = MRI->getType(VData);

                                            bool IsFormat) const {

  LLT Ty = MRI.getType(VData);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(3).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();

                            unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
                                           bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);
    Dst = MI.getOperand(0).getReg();

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;

    unsigned NumLoadDWords = NumValueDWords + 1;
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      B.buildUnmerge(LoadElts, LoadDstReg);
      B.buildMergeLikeInstr(Dst, LoadElts);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);

  MI.eraseFromParent();
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                                     bool IsA16, bool IsG16) {

  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
               "Bias needs to be converted to 16 bit in A16 mode");
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                       int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {
    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));

  for (int I = 1; I != NumVAddrs; ++I) {
    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());

  const bool IsA16 = AddrTy == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    } else if (DMask != 0) {
    } else if (!IsTFE && !BaseOpcode->Store) {
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);

  if (BaseOpcode->Atomic) {
    LLT Ty = MRI->getType(VData0);
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (IsA16 && !ST.hasA16()) {

  if (IsA16 || IsG16) {
        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      if (!SrcOp.isReg()) {
      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        SrcOp.setReg(AMDGPU::NoRegister);

        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
                                 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                                 Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {

    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);

  if (NumElts < DMaskLanes)

  if (NumElts > 4 || DMaskLanes > 4)

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =

  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;

    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
  MI.getOperand(0).setReg(NewResultReg);

    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
    MI.removeOperand(1);

    B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    ResultRegs[0] = NewResultReg;
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    ResultRegs.resize(NumDataRegs);

      B.buildTrunc(DstReg, ResultRegs[0]);

      B.buildBitcast(DstReg, ResultRegs[0]);

      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
      Reg = B.buildTrunc(S16, Reg).getReg(0);

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)

  LLT ResTy = MRI->getType(ResultRegs[0]);
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);

    if (ResultRegs.size() == 1) {
      NewResultReg = ResultRegs[0];
    } else if (ResultRegs.size() == 2) {
      NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);

    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;

    B.setInsertPt(B.getMBB(), MI);

    B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);

  MI.eraseFromParent();
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    Register LoadAddr = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,

    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)

    MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

    MI.eraseFromParent();

  B.buildInstr(AMDGPU::S_TRAP)
  MI.eraseFromParent();

        "debugtrap handler not supported",
    LLVMContext &Ctx = B.getMF().getFunction().getContext();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();

        "intrinsic not supported on subtarget",
    B.getMF().getFunction().getContext().diagnose(BadIntrin);

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);

        IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
              S32,
              B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                            UnmergeRayDir.getReg(0)}))
              S32,
              B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                            UnmergeRayDir.getReg(1)}))
              S32,
              B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                            UnmergeRayDir.getReg(2)}))

      packLanes(RayInvDir);

      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);

      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});

      packLanes(RayInvDir);

    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addImm(IsA16 ? 1 : 0)

  MI.eraseFromParent();
  int RoundMode = MI.getOperand(2).getImm();

  unsigned HW_Mode = (RoundMode + 3) % 4;
  B.buildInstr(AMDGPU::G_FPTRUNC_ROUND)
      .addDef(MI.getOperand(0).getReg())

  MI.eraseFromParent();

  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
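// Illustrative sketch only: the wave ID above is an unsigned bitfield extract
// of 5 bits starting at bit 25 of TTMP8, i.e. the G_UBFX built by
// buildUbfx(DstReg, TTMP8, 25, 5). Standalone model:
#include <cstdint>
static uint32_t waveIdFromTTMP8Sketch(uint32_t TTMP8Value) {
  return (TTMP8Value >> 25) & 0x1fu;  // 5-bit field at offset 25
}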
  if (MRI.getType(Src) != S64)

  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
      .addReg(Unmerge.getReg(0));
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    if (IntrID == Intrinsic::amdgcn_if) {
      B.buildInstr(AMDGPU::SI_IF)
          .addMBB(UncondBrTarget);
      B.buildInstr(AMDGPU::SI_ELSE)
          .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

    MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
    MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
    MI.eraseFromParent();
    BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {
    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    B.buildInstr(AMDGPU::SI_LOOP)
        .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

    MI.eraseFromParent();
    BrCond->eraseFromParent();
    MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
  case Intrinsic::amdgcn_addrspacecast_nonnull:
  case Intrinsic::amdgcn_make_buffer_rsrc:
  case Intrinsic::amdgcn_kernarg_segment_ptr:
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_wave_id:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_implicit_buffer_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::r600_read_ngroups_x:
  case Intrinsic::r600_read_ngroups_y:
  case Intrinsic::r600_read_ngroups_z:
  case Intrinsic::r600_read_local_size_x:
  case Intrinsic::r600_read_local_size_y:
  case Intrinsic::r600_read_local_size_z:
  case Intrinsic::r600_read_global_size_x:
  case Intrinsic::r600_read_global_size_y:
  case Intrinsic::r600_read_global_size_z:
  case Intrinsic::amdgcn_fdiv_fast:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_wavefrontsize: {
    MI.eraseFromParent();
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
  case Intrinsic::amdgcn_fmed3: {
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static constexpr unsigned MaxRegisterSize
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
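The legalize* hooks above are reached through the Custom legalization action. A hedged, illustrative sketch (the opcode choice and helper pairing are examples, not the actual AMDGPU rule set) of how the constructor marks an operation custom so that legalizeCustom() later dispatches to one of these members:
  // Illustrative only: mark s64 G_FCEIL custom; the Legalizer then calls
  // legalizeCustom(), which would route it to a helper such as legalizeFceil().
  getActionDefinitionsBuilder(TargetOpcode::G_FCEIL)
      .legalFor({LLT::scalar(32)})
      .customFor({LLT::scalar(64)})
      .scalarize(0);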
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
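A small, hedged sketch of these APFloat factories; the 32-bit semantics and the MachineIRBuilder B used to materialize the constant are assumptions for illustration.
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat NegInf    = APFloat::getInf(Sem, /*Negative=*/true);   // -infinity
  APFloat MaxFinite = APFloat::getLargest(Sem);                  // largest finite f32
  APFloat SmallNorm = APFloat::getSmallestNormalized(Sem);       // smallest normal f32
  auto C = B.buildFConstant(LLT::scalar(32), SmallNorm);         // emit as G_FCONSTANT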
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
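A hedged sketch of emitting compares that use these predicates; B is an assumed MachineIRBuilder and X, Y are assumed registers of matching scalar type.
  LLT S1 = LLT::scalar(1);
  auto SLt = B.buildICmp(CmpInst::ICMP_SLT, S1, X, Y);  // signed X < Y
  auto Ord = B.buildFCmp(CmpInst::FCMP_ORD, S1, X, Y);  // true if neither operand is NaN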
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
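A minimal sketch of the notification contract described above, assuming MI is about to be rewritten in place and NewReg is a hypothetical replacement register.
  Observer.changingInstr(MI);          // about to mutate MI
  MI.getOperand(0).setReg(NewReg);     // the in-place change itself
  Observer.changedInstr(MI);           // mutation done; listeners may re-process MI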
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
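A hedged sketch of building and inspecting the low-level types above; the concrete sizes and the address space are illustrative.
  LLT S32   = LLT::scalar(32);
  LLT F16   = LLT::float16();                    // 16-bit IEEE half
  LLT V2S16 = LLT::fixed_vector(2, 16);          // <2 x s16>
  LLT P1    = LLT::pointer(1, 64);               // 64-bit pointer in addrspace(1)
  assert(V2S16.isVector() && V2S16.getNumElements() == 2);
  LLT V2S32 = V2S16.changeElementSize(32);       // <2 x s32>
  LLT Elt   = V2S16.getElementType();            // s16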
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
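A hedged, illustrative chain combining several of the builder methods above (not the actual rules for any particular opcode):
  using namespace TargetOpcode;
  const LLT S32 = LLT::scalar(32), S64 = LLT::scalar(64);
  getActionDefinitionsBuilder(G_AND)
      .legalFor({S32, S64})              // legal as-is for 32- and 64-bit scalars
      .clampScalar(0, S32, S64)          // force everything else into that range
      .widenScalarToNextPow2(0)          // round odd sizes up to a power of two
      .scalarize(0);                     // break up any remaining vectors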
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
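A minimal sketch of a custom hook in the shape used by the legalize* members above, assuming the Legalizer has already positioned Helper.MIRBuilder at MI; the function name and the lowering itself are placeholders.
  bool legalizeMyOp(LegalizerHelper &Helper, MachineInstr &MI) {
    MachineIRBuilder &B = Helper.MIRBuilder;     // insertion point is already at MI
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    B.buildCopy(Dst, Src);                       // stand-in for the real expansion
    MI.eraseFromParent();                        // the original instruction is gone
    return true;                                 // report success to the Legalizer
  }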
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
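A hedged sketch of allocating an MMO for a 4-byte invariant load; the pointer info and alignment are assumptions.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),                                        // unknown memory location
      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,  // invariant read
      LLT::scalar(32),                                             // 32-bit memory type
      Align(4));                                                   // known 4-byte alignment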
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
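A hedged sketch of building generic instructions with a MachineIRBuilder B; X is an assumed 32-bit register.
  LLT S32 = LLT::scalar(32);
  auto K   = B.buildConstant(S32, 42);       // G_CONSTANT 42
  auto Sum = B.buildAdd(S32, X, K);          // G_ADD %X, %K
  Register SumReg = Sum.getReg(0);           // operand 0 of the builder is the def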
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
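As a hedged sketch, these address spaces typically map to LLT pointer types such as the following (pointer widths follow the AMDGPU data layout):
  const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);         // 64-bit global
  const LLT FlatPtr    = LLT::pointer(AMDGPUAS::FLAT_ADDRESS, 64);           // 64-bit flat
  const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);          // 32-bit LDS
  const LLT PrivatePtr = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);        // 32-bit scratch
  const LLT Const32Ptr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32); // 32-bit constant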
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
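A hedged sketch: splitting an address into a base register and an immediate offset, for example before folding the offset into buffer addressing; AddrReg is an assumed pointer-sized register.
  auto [BaseReg, ImmOffset] =
      AMDGPU::getBaseWithConstantOffset(MRI, AddrReg);   // offset is 0 if none was found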
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
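A hedged, illustrative rule showing how these predicates compose; the opcode and type choices are examples, not AMDGPU's actual rules.
  getActionDefinitionsBuilder(TargetOpcode::G_ICMP)
      .legalIf(all(typeIs(0, LLT::scalar(1)),
                   typeInSet(1, {LLT::scalar(32), LLT::scalar(64)})))
      .customIf(isPointer(1))                              // pointer compares get custom handling
      .clampScalar(1, LLT::scalar(32), LLT::scalar(64));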
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
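A hedged sketch pairing predicates with these mutations; the opcode, the thresholds, and the use of widenScalarIf (assumed here alongside the rule-set methods listed earlier) are illustrative.
  getActionDefinitionsBuilder(TargetOpcode::G_OR)
      .widenScalarIf(scalarNarrowerThan(0, 32),
                     widenScalarOrEltToNextPow2(0, 32))   // e.g. s17 -> s32
      .fewerElementsIf(elementTypeIs(0, LLT::scalar(64)),
                       scalarize(0));                     // break 64-bit-element vectors apart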
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns that value.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ Mul
Product of integers.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
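A hedged sketch of probing an operand for a compile-time constant; the operand index is an assumption.
  Register Off = MI.getOperand(2).getReg();
  if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(Off, MRI)) {
    // direct G_CONSTANT: fold *Imm immediately
  } else if (auto VV = getIConstantVRegValWithLookThrough(Off, MRI)) {
    int64_t Folded = VV->Value.getSExtValue();   // constant found through copies/truncs
  }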
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
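A hedged sketch of the integer helpers above applied to one sample size (96 bits); the expected results are in the comments.
  const unsigned Size = 96;
  unsigned Pieces  = divideCeil(Size, 64);     // 2   (ceil(96 / 64))
  uint64_t PowCeil = PowerOf2Ceil(Size);       // 128 (smallest power of two >= 96)
  uint64_t NextPow = NextPowerOf2(Size);       // 128 (strictly greater than 96)
  bool IsPow2      = isPowerOf2_32(Size);      // false
  unsigned CeilLg2 = Log2_32_Ceil(Size);       // 7
  int Width        = llvm::bit_width(96u);     // 7   (bits needed to represent 96)
  unsigned Floor   = llvm::bit_floor(96u);     // 64  (largest power of two <= 96)
  int Ones         = llvm::popcount(0x96u);    // 4   (set bits in 0b10010110)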
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static const fltSemantics & IEEEsingle() LLVM_READNONE
static const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.