#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    const LLT Ty = Query.Types[TypeIdx];

           EltSize > 1 && EltSize < 32 &&

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;

    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
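    // Illustrative arithmetic for the two rounding idioms above (a sketch,
    // not part of the original comments): (Size + 63) / 64 is ceil(Size/64),
    // and 32 * ((Size + 31) / 32) rounds Size up to the next multiple of 32.
    // For example, with Size = 96 and EltSize = 16:
    //   NextMul32  = (96 + 31) / 32 = 3          // 96 bits -> 3 dwords
    //   NewNumElts = (32 * 3 + 16 - 1) / 16 = 6  // 96 bits -> <6 x s16>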
    const LLT Ty = Query.Types[TypeIdx];
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

  return EltSize == 16 || EltSize % 32 == 0;

  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;
    LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;

    return ST.useDS128() ? 128 : 64;

    return IsLoad ? 512 : 128;

    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned AS = Query.Types[1].getAddressSpace();

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

                           AtomicOrdering::NotAtomic))

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                      Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)

                                      uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)

                           Query.Types[1].getAddressSpace(), Opcode);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;

      Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
      std::array<Register, 4> VectorElems;
      B.setInsertPt(B.getMBB(), ++B.getInsertPt());
      for (unsigned I = 0; I < NumParts; ++I)
            B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
      B.buildMergeValues(MO, VectorElems);

    Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
    B.buildIntToPtr(MO, Scalar);

    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
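// Rough sketch of the strategy above (names match the surrounding
// fragments): pointer operands that are awkward as operation types are
// round-tripped through integers. A pointer wider than 32 bits is split
// into 32-bit pieces with G_UNMERGE_VALUES and rebuilt as an <N x s32>
// vector; a narrower pointer goes through a single G_PTRTOINT plus bitcast.
// The first helper performs the inverse, reconstituting the pointer from
// extracted vector elements (or bitcast + G_INTTOPTR) after the operation.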
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {
        .clampMaxNumElementsStrict(0, S16, 2)

        .clampMaxNumElementsStrict(0, S16, 2)

        .clampMaxNumElementsStrict(0, S16, 2)

        .clampMaxNumElementsStrict(0, S16, 2)

        .minScalarOrElt(0, S16)

        .widenScalarToNextMultipleOf(0, 32)

        .widenScalarToNextMultipleOf(0, 32)

        .widenScalarToNextMultipleOf(0, 32);

        .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

        .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})

      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

        .legalFor(FPTypesPK16)

        .clampScalar(0, S16, S64);

        .clampScalar(0, S32, S64);

        .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)

      .clampScalar(1, S32, S32)

    FMad.customFor({S32, S16});

    FMad.customFor({S32});

    FMad.customFor({S16});

  FRem.minScalar(0, S32)

      .clampMaxNumElements(0, S16, 2)

      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);
  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

      if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);

    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
    Actions.customIf(typeIs(1, Constant32Ptr));

              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              if (DstSize > MemSize)

              if (MemSize > MaxSize)

              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {

                if (MaxSize % EltSize == 0) {

                unsigned NumPieces = MemSize / MaxSize;

                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::pair(0, EltTy);

                return std::pair(0, EltTy);

              return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

  getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
      .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_SELECT)
                   LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];

          const bool isLegalVecType =

          return (EltSize == 32 || EltSize == 64) &&

              const LLT EltTy = Query.Types[EltTypeIdx];
              const LLT VecTy = Query.Types[VecTypeIdx];

              const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
            const LLT BigTy = Query.Types[BigTyIdx];
            const LLT BigTy = Query.Types[BigTyIdx];
            const LLT LitTy = Query.Types[LitTyIdx];
              const LLT BigTy = Query.Types[BigTyIdx];
              const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
        .widenScalarToNextPow2(LitTyIdx, 16)
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)
            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
              Intrinsic::amdgcn_addrspacecast_nonnull));
  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
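  // Sketch of the identity used above: frem(x, y) = x - trunc(x / y) * y,
  // with the multiply-and-subtract fused into one FMA, fma(-trunc(x/y), y, x).
  // Worked example: frem(5.5, 2.0): x/y = 2.75, trunc = 2.0,
  // fma(-2.0, 2.0, 5.5) = 1.5.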
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
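  // Rough shape of the f64 trunc expansion above, working directly on the
  // IEEE-754 layout (1 sign bit, 11 exponent bits, 52 fraction bits):
  // amdgcn_ubfe extracts the biased exponent and 1023 is the bias;
  // FractMask >> Exp selects the fraction bits below the integer boundary,
  // which are then cleared from Src. An exponent below 0 yields a
  // signed zero (SignBit64); an exponent above 51 means the value is
  // already integral, so Src passes through unchanged.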
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
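  // Summary of the two paths above (a reading of the fragments, not an
  // original comment): for s64 -> f64 the halves convert separately and
  // recombine exactly as ldexp(convert(hi), 32) + uitofp(lo), since f64
  // holds a 32-bit half without rounding. For s64 -> f32 the value is first
  // normalized by a left shift (amdgcn_sffbh / ctlz pick the shift amount),
  // any shifted-out low bits are sticky-ORed into bit 0 (Adjust/Norm2) so
  // rounding stays correct, and the result is rescaled with ldexp.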
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));

    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
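  // The magic constants above are powers of two used to split the value into
  // two 32-bit halves: 0x3df0000000000000 is 2^-32 and 0xc1f0000000000000 is
  // -2^32 as doubles; 0x2f800000 is 2^-32 and 0xcf800000 is -2^32 as floats.
  // Hi = fptoui(floor(x * 2^-32)) and Lo = fptoui(fma(floor(x * 2^-32),
  // -2^32, x)), i.e. x = Hi * 2^32 + Lo. The signed path negates the merged
  // result via the (v ^ sign) - sign pattern.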
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                       unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);

        GV->getName() != "llvm.amdgcn.module.lds") {
          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {

      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

                                      *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

    if (WideMemSize == ValSize) {

      MI.setMemRefs(MF, {WideMMO});

    if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

    MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
  case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
                                                  unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
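  // Sketch of the scaling above: the hardware log instruction flushes
  // denormal inputs, so values below the smallest normal f32 are multiplied
  // by 2^32 (0x1.0p+32) first. Since log2(x * 2^32) = log2(x) + 32, callers
  // subtract 32 from the result exactly when IsLtSmallestNormal was true.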
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {

      auto PromoteSrc = B.buildFPExt(F32, X);
      B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {

    auto Fabs = B.buildFAbs(Ty, Y);

    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
                                        unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);

      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                  RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
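  // Range note for the sequence above: amdgcn_exp2 underflows for inputs
  // near the bottom of the f32 exponent range (the check constant
  // -0x1.f8p+6 is -126). Such inputs are biased by +64 before the intrinsic
  // and the result is rescaled by 2^-64 afterwards, relying on
  // exp2(x + 64) * 2^-64 == exp2(x).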
  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

          .addUse(Mul.getReg(0))

      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  const unsigned Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);

    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
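  // Overall shape of the expansion above (a reading of the fragments):
  // exp(x) or exp10(x) is computed as 2^(x * log2(e)) (resp. log2(10)),
  // with the product split into a rounded integer part E and a small
  // remainder A, so the result is ldexp(exp2(A), E). The c/cc and ch/cl
  // constant pairs are head/tail splittings of the log2 constants that
  // recover the low product bits through FMA. The final selects clamp
  // guaranteed underflow to 0.0 and guaranteed overflow to +inf.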
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {

    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
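  // The floor lowering above uses floor(x) = x + (-fract(x)): the
  // amdgcn_fract result is clamped with a min against
  // 0x3fefffffffffffff (0x1.fffffffffffffp-1, the largest f64 below 1.0)
  // to guard the boundary case, and NaN inputs pass through via the
  // IsNan select before the final add.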
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {

    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();

                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;

    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
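          // Rough sketch of what buildMadChain accumulates: one 32-bit
          // "digit" of a schoolbook product. For a fixed DstIndex it sums
          // Src0[j0] * Src1[j1] over all j0 + j1 == DstIndex, either as
          // plain 32-bit mul/add(-with-carry) chains or as 64-bit
          // G_AMDGPU_MAD_U64_U32 steps whose high half flows into the next
          // digit. E.g. a 64x64 -> 64 multiply (two digits per operand)
          // forms digit 0 from Src0[0]*Src1[0] and digit 1 from
          // Src0[0]*Src1[1] + Src0[1]*Src1[0], skipping known-zero digits.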
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        bool IsHighest = 2 * i >= Accum.size();
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
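  // For a source narrower than 32 bits, the count above is made correct by
  // shifting the value into the top of a 32-bit register:
  // ffbh(x << (32 - N)) equals the leading-zero count of the original
  // N-bit value. E.g. for an s16 source, x is any-extended, shifted left
  // by 16, counted, and the result truncated back to the destination type.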
  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {

    UncondBrTarget = &*NextMBB;

    if (Next->getOpcode() != AMDGPU::G_BR)

                                     *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

    B.buildCopy(DstReg, LiveIn);
    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      B.buildConstant(DstReg, 0);

      B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

    MI.eraseFromParent();

    B.buildConstant(MI.getOperand(0).getReg(), C);
    MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

                                                 Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(
  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
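  // Signed division above reduces to the unsigned expansion using the
  // identity abs(v) = (v + s) ^ s with s = v >> 31 (or >> 63). The quotient
  // sign is the XOR of the operand signs and the remainder takes the sign
  // of the dividend, both reapplied with the same (t ^ s) - s pattern.
  // Worked example: -7 / 2: abs gives 7 / 2 = 3 rem 1; quotient sign is
  // negative, so the results are -3 and -1.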
  LLT ResTy = MRI.getType(Res);

  if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
      return false;

    // 1.0 / x -> rcp(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1.0 / x -> rcp(fneg x)
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require afn or arcp; for f32 require afn.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              !MI.getFlag(MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;
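  // The sequence below is a Newton-Raphson refinement of r ~= 1/y. Each step
  // computes the residual e = fma(-y, r, 1) and updates r' = fma(e, r, r)
  // = r * (1 + e), which squares the relative error, so two steps after the
  // hardware rcp estimate suffice here. The final pair of FMAs folds the
  // quotient x * r together with one more correction of its residual.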
  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
               .addUse(Y)
               .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
      .addUse(LHS)
      .addUse(RHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        .addImm(SPDenormModeBitField);
  }
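// Note on the f32 fdiv expansion below: the FMA-based refinement of the
// scaled reciprocal is only accurate when single-precision denormals are not
// flushed. When the function's FP mode flushes them, the FMA sequence is
// bracketed with the denorm-mode toggles above (or, for dynamic denormal
// modes, with an S_GETREG/S_SETREG save and restore of the MODE register).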
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();
  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    } else
      toggleSPDenormMode(true, B, ST, Mode);
  }

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    } else
      toggleSPDenormMode(false, B, ST, Mode);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
      .addUse(LHS)
      .addUse(RHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
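// The f64 path below follows roughly the same shape as the f32 path above:
// amdgcn.div.scale pre-scales numerator and denominator into a safe exponent
// range (its second, s1 result records which operand was scaled), amdgcn.rcp
// seeds an FMA-based reciprocal refinement, amdgcn.div.fmas reapplies the
// scaling using that flag, and amdgcn.div.fixup restores signs and handles
// infinities and NaNs at the end.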
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from
    // div_scale is not usable.

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(CmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(CmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
      .addUse(LHS)
      .addUse(RHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
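// For reference on the lowering below: frexp decomposes a float so that
// Val = Mant * 2^Exp with 0.5 <= |Mant| < 1 for finite nonzero inputs, e.g.
// frexp(8.0) = (0.5, 4). On subtargets with the "fract bug" the hardware
// frexp instructions do not produce the expected (Val, 0) result for inf/nan
// inputs, which is what the IsFinite selects compensate for.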
bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res0 = MI.getOperand(0).getReg();
  Register Res1 = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();

  LLT Ty = MRI.getType(Res0);
  LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
                  .addUse(Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
                 .addUse(Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  // Scale the denominator down if its magnitude is too large, take the
  // reciprocal of the scaled value, and apply the same scale to the quotient.
  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
                 .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
  // There is no f16 fsqrt instruction, so widen to f32, use the f32 sqrt, and
  // truncate the result back to f16.
  unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
                  .setMIFlags(Flags);
  B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
  MI.eraseFromParent();
  return true;
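// A sketch of the scaling in the f32 sqrt expansion below: inputs smaller
// than 2^-96 would lose precision in the intermediate computations, so they
// are pre-multiplied by 2^+32. Since sqrt(2^32 * x) = 2^16 * sqrt(x), results
// for scaled inputs are multiplied by 2^-16 at the end to compensate.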
  const unsigned Flags = MI.getFlags();
  const MachineFunction &MF = B.getMF();

  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();

  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
        .addUse(X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
  if (needsDenormHandlingF32(MF, X, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
        .addUse(SqrtX.getReg(0))
        .setMIFlags(Flags);

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);

    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  } else {
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  }

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
  return true;
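// The f64 expansion below is a Goldschmidt-style iteration: seed
// h0 ~= 0.5/sqrt(x) and s0 = x * rsq(x) ~= sqrt(x), then each round forms the
// residual r = 0.5 - h*s via FMA and updates s' = s + s*r and h' = h + h*r,
// finishing with a correction d = x - s*s folded back in as s + d*h. The
// ldexp scaling uses +256 going in but only -128 coming out because the
// square root halves the exponent.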
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(CmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up the input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale down the result.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeFSQRTF32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeFSQRTF64(MI, MRI, B);
  if (Ty == LLT::scalar(16))
    return legalizeFSQRTF16(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 .addUse(Src)
                 .setMIFlags(Flags);

  // Clamp the rsq result into the largest finite range; which min/max flavor
  // to use depends on the function's IEEE mode.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
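// The lane intrinsics (readlane, writelane, permlane*, readfirstlane) operate
// on 32-bit registers, so the legalization below widens sub-32-bit values
// with anyext + trunc, and splits wider values whose size is a multiple of 32
// (e.g. s64, or 16-bit element vectors regrouped as v2s16) into pieces,
// applies the lane op piecewise, and re-merges the partial results.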
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;

  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // permlane16/permlanex16 carry two extra immediate operands (fi, bc).
      return LaneOp.addUse(Src1)
          .addUse(Src2)
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .getReg(0);
    }
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
      Src2 = MI.getOperand(4).getReg();
  }

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  if (Size == 32)
    return true; // Already legal.

  if (Size < 32) {
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IsPermLane16)
      Src1 = B.buildAnyExt(S32, Src1).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(S32, Src2).getReg(0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % 32 != 0)
    return false;

  LLT PartialResTy = S32;
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    if (EltTy.getSizeInBits() == 16)
      PartialResTy = V2S16;
    else if (EltTy.getSizeInBits() == 32)
      PartialResTy = EltTy;
    // Other element types are handled as plain 32-bit pieces.
  }

  SmallVector<Register, 4> PartialRes;
  unsigned NumParts = Size / 32;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
  return true;
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}
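// The make.buffer.rsrc legalization below assembles the four dwords of a
// buffer resource descriptor: dword 0 is the low half of the base pointer;
// dword 1 keeps the low 16 bits of the high pointer half and carries the
// 16-bit stride in its upper half; dwords 2 and 3 are the caller-supplied
// NumRecords and Flags operands.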
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  LLT S32 = LLT::scalar(32);

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  MachineInstrBuilder NewHighHalf = Masked;
  std::optional<ValueAndVReg> StrideConst =
      getIConstantVRegValWithLookThrough(Stride, MRI);
  if (!StrideConst || !StrideConst->Value.isZero()) {
    MachineInstrBuilder ShiftedStride;
    if (StrideConst) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  Register NewHighHalfReg = NewHighHalf.getReg(0);
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  return true;
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  return KnownSize.has_value();
}

bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
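// splitBufferOffsets (below) splits a buffer offset into the part that fits
// in the MUBUF immediate field and a register part. For example, with a
// maximum immediate of 0xFFF, a constant offset of 0x1234 becomes
// ImmOffset = 0x234 plus a voffset register holding 0x1000; keeping the
// register part a round power of two makes it more likely to be CSE'd with
// the offsets of neighboring loads and stores.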
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, keep only the
  // bits that fit and move the rest into the voffset register. Do not round
  // down to a negative value, as a negative voffset appears to be illegal
  // even if adding the immediate offset would make it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  return Reg;
}
Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
                                                 Register VData,
                                                 bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup buffer resources themselves needing to be handled as v4i32.
  if (hasBufferRsrcWorkaround(Ty))
    return castBufferRsrcToV4I32(VData, B);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  castBufferRsrcArgToV4I32(MI, B, 2);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
                 .addUse(VData)
                 .addUse(RSrc)
                 .addUse(VIndex)
                 .addUse(VOffset)
                 .addUse(SOffset)
                 .addImm(ImmOffset);

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex,
                            MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg)
                 .addUse(RSrc)
                 .addUse(VIndex)
                 .addUse(VOffset)
                 .addUse(SOffset)
                 .addImm(ImmOffset);

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
}
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  // Make addrspace 8 pointers loadable by casting them to v4i32.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;
  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
                 .addDef(Dst)
                 .addUse(VData);

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)
      .addUse(VIndex)
      .addUse(VOffset)
      .addUse(SOffset)
      .addImm(ImmOffset)
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit slot.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

  const MachineFunction &MF = B.getMF();
  MachineRegisterInfo *MRI = B.getMRI();
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  // Check for 16-bit gradients and addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on, but dmask is 0: force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16.
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16-bit gradients are supported, but are tied to the A16 control, so both
    // gradients and addresses must be 16-bit.
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported.
    return false;
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because
    // the instructions expect VGPR_32.
    SmallVector<Register, 4> PackedRegs;

    packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);

    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // Pack registers that would go over NSAMaxSize into the last VAddr
      // register.
      LLT PackedAddrTy =
          LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
      PackedRegs.resize(1);
    }

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // The NSA encoding saves VALU moves when addresses cannot be allocated
    // contiguously; partial NSA on GFX11+ requires the final register to hold
    // a contiguous run of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
    }
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim.
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomics use DMask to specify how many bits of input/output data
  // there are (32 or 64 bits), with a default DMaskLanes of 0, so keep the
  // original type for them.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return type.
  // The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
  Register OrigDst = MI.getOperand(0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
    // destination registers.
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1); // Remove the intrinsic ID.

  // FIXME: When the intrinsic definition is fixed, this should have an MMO
  // already.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      getTypeForLLT(Ty, MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
    BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
        .addImm(0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the end
  // of the block.
  BB.splitAt(MI, false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(TrapBB);
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      .addImm(0);
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(TrapBB);

  BB.addSuccessor(TrapBB);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5 and up, queue_ptr is passed through the implicit
  // kernarg segment.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load the queue pointer and hand it to the trap handler in SGPR0_SGPR1.
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass the queue pointer to the trap handler as input, and insert the trap
  // instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
  // Simulate "s_trap 2" on targets that run in PRIV=1, where it is treated as
  // a nop.
  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                           MI.getDebugLoc());
    MI.eraseFromParent();
    return true;
  }

  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Is this a non-HSA path, or is the trap handler disabled? Then report a
  // warning accordingly.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(
            static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                            MachineIRBuilder &B) const {
  const SITargetLowering *TLI = ST.getTargetLowering();
  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
  Register DstReg = MI.getOperand(0).getReg();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  return true;
}
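// legalizeWaveID (below) reads the wave ID within a workgroup out of TTMP8 on
// targets with architected SGPRs: with LSB 25 and width 5, the G_UBFX
// extracts bits [29:25] of that register.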
bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
                                         MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(FPEnvModeBitField)
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(FPEnvTrapBitField)
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to workaround the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
    return legalizeLaneOp(Helper, MI, IntrID);
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
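These APFloat factories produce exact floating-point constants for a chosen semantics, which legalization code needs when materializing limits such as the smallest normal or infinity. A minimal sketch (the variable names are illustrative):

#include "llvm/ADT/APFloat.h"
using llvm::APFloat;

// Smallest normal, largest finite, and negative infinity in IEEE single.
APFloat SmallestNormal = APFloat::getSmallestNormalized(APFloat::IEEEsingle());
APFloat LargestFinite = APFloat::getLargest(APFloat::IEEEsingle());
APFloat NegInf = APFloat::getInf(APFloat::IEEEsingle(), /*Negative=*/true);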
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
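These predicate enumerators are the values MachineIRBuilder's compare builders take. A hedged sketch, assuming B is a positioned MachineIRBuilder, LHS/RHS are s32 virtual registers, and S1/S32 stand for LLT::scalar(1)/LLT::scalar(32); integer predicates work the same way via buildICmp:

// G_FCMP with an ordered less-than predicate, then a select on the result.
auto IsOLT = B.buildFCmp(CmpInst::FCMP_OLT, S1, LHS, RHS);
auto Min = B.buildSelect(S32, IsOLT, LHS, RHS); // floating-point-style min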
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
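Taken together, these LLT helpers cover the type arithmetic the legalizer rules do constantly. A small self-contained sketch of constructing and transforming types (everything here is a compile-time constant):

constexpr LLT S16 = LLT::scalar(16);
constexpr LLT V2S16 = LLT::fixed_vector(2, 16); // <2 x s16>, 32 bits total
constexpr LLT Ptr0 = LLT::pointer(/*AddressSpace=*/0, 64);

static_assert(V2S16.isVector() && V2S16.getNumElements() == 2);
constexpr LLT V2S32 = V2S16.changeElementSize(32); // <2 x s32>
static_assert(V2S32.getElementType() == LLT::scalar(32));
static_assert(Ptr0.isPointer() && !Ptr0.isVector());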
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1).
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
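In practice these combinators are chained off getActionDefinitionsBuilder to describe one opcode's legality. A representative chain in the style this file uses (the opcode and exact rules are illustrative, not a quote of any one rule set; S16/S32/V2S16 are the usual LLT constants):

getActionDefinitionsBuilder(G_ADD)
    .legalFor({S32, S16, V2S16})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .minScalar(0, S16)
    .widenScalarToNextMultipleOf(0, 32)
    .maxScalar(0, S32);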
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with additional vector elements and extracting the result elements.
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from CastTy.
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's type to WideTy and truncating it back with the TruncOpcode to the original type.
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
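MachineIRBuilder is the mechanism the legalize* routines above use to emit replacement instructions. A hedged sketch, assuming B's insertion point is already set and Src0/Src1 are existing s32 virtual registers:

const LLT S32 = LLT::scalar(32);
auto K = B.buildConstant(S32, 42);      // G_CONSTANT
auto Sum = B.buildAdd(S32, Src0, Src1); // G_ADD
auto Res = B.buildMul(S32, Sum, K);     // G_MUL
Register ResReg = Res.getReg(0);        // result vreg of the G_MUL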
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
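A typical use is peeling a constant offset off an address so it can be folded into an instruction's immediate field. A sketch under the assumption that MRI and AddrReg are in scope (this helper lives in the llvm::AMDGPU namespace):

std::pair<Register, unsigned> BaseOff =
    AMDGPU::getBaseWithConstantOffset(MRI, AddrReg);
Register Base = BaseOff.first;       // residual base address
unsigned ImmOffset = BaseOff.second; // constant offset peeled off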
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
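Predicates and mutations pair up inside conditional rules: the predicate decides whether a rule fires, and the mutation picks the replacement type. An illustrative combination, reusing the using-namespace shortcuts this file already employs (the opcode and thresholds are arbitrary; widenScalarIf is another LegalizeRuleSet combinator):

getActionDefinitionsBuilder(G_FADD)
    .legalIf(typeInSet(0, {S32, S16, V2S16}))
    .fewerElementsIf(elementTypeIs(0, S64), scalarize(0))
    .widenScalarIf(scalarNarrowerThan(0, 16), changeTo(0, S16));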
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
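These integer helpers do the size arithmetic that recurs throughout legalization (splitting wide types, rounding to register sizes). A quick illustration of what each computes:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

unsigned Pieces = llvm::divideCeil(96, 64); // 2 (96 bits need two 64-bit pieces)
uint64_t Rounded = llvm::PowerOf2Ceil(33);  // 64
unsigned CeilLg = llvm::Log2_32_Ceil(33);   // 6
bool IsPow2 = llvm::isPowerOf2_32(64);      // true
int Ones = llvm::popcount(0xF0u);           // 4
int Width = llvm::bit_width(5u);            // 3 (bits 0..2)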
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
@ Mul
Product of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
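These lookups are the standard way a legalization routine checks whether an operand is a known constant before special-casing it. A hedged sketch, assuming MI and MRI are in scope and operand 2 is a value operand:

Register SrcReg = MI.getOperand(2).getReg();
if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(SrcReg, MRI)) {
  // *Imm is the sign-extended constant; emit the cheaper form here.
}
// Looking through copies/extensions to the underlying G_CONSTANT:
if (auto VAndR = getIConstantVRegValWithLookThrough(SrcReg, MRI)) {
  int64_t Val = VAndR->Value.getSExtValue(); // VAndR->VReg is the def register
  // use Val ...
}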
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
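commonAlignment and Log2 handle the alignment arithmetic in the load/store rules above. A small illustration (results shown in the comments):

llvm::Align A(16);
unsigned Shift = llvm::Log2(A); // 4
llvm::Align AtOff =
    llvm::commonAlignment(llvm::Align(8), /*Offset=*/4); // Align(4)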
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static const fltSemantics & IEEEsingle() LLVM_READNONE
static const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands, relating them back to the LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.