#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),

unsigned Bits = Ty.getSizeInBits();
const LLT Ty = Query.Types[TypeIdx];

return Ty.getNumElements() % 2 != 0 &&
       EltSize > 1 && EltSize < 32 &&
       Ty.getSizeInBits() % 32 != 0;

const LLT Ty = Query.Types[TypeIdx];

const LLT Ty = Query.Types[TypeIdx];
return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

const LLT Ty = Query.Types[TypeIdx];
return std::pair(TypeIdx,

const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
unsigned Pieces = (Size + 63) / 64;
unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;

const LLT Ty = Query.Types[TypeIdx];
const int Size = Ty.getSizeInBits();
const int NextMul32 = (Size + 31) / 32;
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
return std::make_pair(TypeIdx, LLT::scalar(MemSize));
const LLT Ty = Query.Types[TypeIdx];
const unsigned EltSize = Ty.getElementType().getSizeInBits();
assert(EltSize == 32 || EltSize == 64);
for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
return std::pair(TypeIdx,

const unsigned NumElems = Ty.getElementCount().getFixedValue();
const unsigned Size = Ty.getSizeInBits();

const LLT Ty = Query.Types[TypeIdx];

const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();

const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];

return ((ST.useRealTrue16Insts() && Size == 16) ||
        Size % 32 == 0) &&

return EltSize == 16 || EltSize % 32 == 0;

const int EltSize = Ty.getElementType().getSizeInBits();
return EltSize == 32 || EltSize == 64 ||
       (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
       EltSize == 128 || EltSize == 256;

LLT Ty = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
if (Ty.isPointerOrPointerVector())
  Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

(ST.useRealTrue16Insts() && Ty == S16) ||

const LLT Ty = Query.Types[TypeIdx];
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
       Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

bool IsLoad, bool IsAtomic) {
return ST.hasFlatScratchEnabled() ? 128 : 32;
return ST.useDS128() ? 128 : 64;
return IsLoad ? 512 : 128;
return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
unsigned AS = Query.Types[1].getAddressSpace();
if (Ty.isVector() && MemSize != RegSize)
if (IsLoad && MemSize < Size)
  MemSize = std::max(MemSize, Align);
if (!ST.hasDwordx3LoadStores())
if (AlignBits < MemSize) {
    Align(AlignBits / 8)))
const unsigned Size = Ty.getSizeInBits();
if (Ty.isPointerVector())

unsigned EltSize = Ty.getScalarSizeInBits();
return EltSize != 32 && EltSize != 64;

const unsigned Size = Ty.getSizeInBits();
if (Size != MemSizeInBits)
  return Size <= 32 && Ty.isVector();

return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

uint64_t AlignInBits, unsigned AddrSpace,
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
if (AlignInBits < RoundedSize)
    RoundedSize, AddrSpace, Align(AlignInBits / 8),
    Query.Types[1].getAddressSpace(), Opcode);
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
std::array<Register, 4> VectorElems;
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
for (unsigned I = 0; I < NumParts; ++I)
  B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
B.buildMergeValues(MO, VectorElems);

Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
B.buildIntToPtr(MO, Scalar);

const unsigned NumParts = PointerTy.getSizeInBits() / 32;
auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
for (unsigned I = 0; I < NumParts; ++I)
return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
return B.buildBitcast(VectorTy, Scalar).getReg(0);
auto GetAddrSpacePtr = [&TM](unsigned AS) {

const LLT BufferStridedPtr =
const LLT CodePtr = FlatPtr;

const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

const std::initializer_list<LLT> FPTypesBase = {
const std::initializer_list<LLT> FPTypes16 = {
const std::initializer_list<LLT> FPTypesPK16 = {

const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
  if (ST.hasScalarAddSub64()) {
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasScalarSMulU64()) {
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .minScalarOrElt(0, S16)
} else if (ST.has16BitInsts()) {

    .widenScalarToNextMultipleOf(0, 32)
if (ST.hasMad64_32())
if (ST.hasIntClamp()) {

    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

if (ST.hasVOP3PInsts()) {
    .clampMaxNumElements(0, S8, 2)

    {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

    LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S16, S64);
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
     G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts())
  FPOpActions.legalFor({S16});
  TrigActions.customFor({S16});
  FDIVActions.customFor({S16});

if (ST.hasPackedFP32Ops()) {
  FPOpActions.legalFor({V2S32});
  FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

auto &MinNumMaxNumIeee =
if (ST.hasVOP3PInsts()) {
  MinNumMaxNumIeee.legalFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
} else if (ST.has16BitInsts()) {
  MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
  MinNumMaxNumIeee.legalFor(FPTypesBase)

    {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
  MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
} else if (ST.has16BitInsts()) {
  MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
  MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)

if (ST.hasVOP3PInsts())
    .legalFor(FPTypesPK16)
if (ST.has16BitInsts()) {
if (ST.hasFractBug()) {
if (ST.hasCvtPkF16F32Inst()) {
    .clampMaxNumElements(0, S16, 2);
FPTruncActions.scalarize(0).lower();
if (ST.has16BitInsts()) {
if (ST.hasMadF16() && ST.hasMadMacF32Insts())
  FMad.customFor({S32, S16});
else if (ST.hasMadMacF32Insts())
  FMad.customFor({S32});
else if (ST.hasMadF16())
  FMad.customFor({S16});

if (ST.has16BitInsts()) {
  FRem.minScalar(0, S32)
    .clampMaxNumElements(0, S16, 2)

if (ST.has16BitInsts())
if (ST.has16BitInsts())
    .clampScalar(0, S16, S64)
    .clampScalar(0, S16, S64)

if (ST.has16BitInsts()) {
    {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
    .clampScalar(0, S16, S64)
    {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
    .clampScalar(0, S32, S64)
    {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
    .clampScalar(0, S32, S64)
    .scalarSameSizeAs(1, 0)

    {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
  CmpBuilder.legalFor({{S1, S16}});

    {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
if (ST.hasSALUFloatInsts())

if (ST.has16BitInsts())
  ExpOps.customFor({{S32}, {S16}});
  ExpOps.customFor({S32});
ExpOps.clampScalar(0, MinScalarFPTy, S32)

    .legalFor(ST.has16BitInsts(), {S16})
    .legalFor(ST.has16BitInsts(), {S16})
    .clampScalar(0, S32, S32)

if (ST.has16BitInsts())
    .widenScalarToNextPow2(1)
    .lowerFor({S1, S16})
    .widenScalarToNextPow2(1)

    .clampScalar(0, S32, S32)
    .clampScalar(0, S32, S64)

if (ST.has16BitInsts()) {
    .clampMaxNumElementsStrict(0, S16, 2)
if (ST.hasVOP3PInsts()) {
    .clampMaxNumElements(0, S16, 2)
if (ST.hasIntMinMax64()) {
    .clampMaxNumElements(0, S16, 2)
    .clampMaxNumElements(0, S16, 2)
    .widenScalarToNextPow2(0)

    .legalForCartesianProduct(AddrSpaces32, {S32})
    .legalForCartesianProduct(AddrSpaces32, {S32})
const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                  bool IsLoad) -> bool {
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned NumRegs = (MemSize + 31) / 32;
  if (!ST.hasDwordx3LoadStores())

unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

for (unsigned Op : {G_LOAD, G_STORE}) {
  const bool IsStore = Op == G_STORE;
  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                    {S64, GlobalPtr, S64, GlobalAlign32},
                                    {S32, GlobalPtr, S8, GlobalAlign8},
                                    {S32, GlobalPtr, S16, GlobalAlign16},
                                    {S32, LocalPtr, S32, 32},
                                    {S64, LocalPtr, S64, 32},
                                    {S32, LocalPtr, S8, 8},
                                    {S32, LocalPtr, S16, 16},
                                    {S32, PrivatePtr, S32, 32},
                                    {S32, PrivatePtr, S8, 8},
                                    {S32, PrivatePtr, S16, 16},
                                    {S32, ConstantPtr, S32, GlobalAlign32},
                                    {S64, ConstantPtr, S64, GlobalAlign32},
                                    {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
  Actions.unsupportedIf(
      typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

  Actions.customIf(typeIs(1, Constant32Ptr));

        return !Query.Types[0].isVector() &&
               needToSplitMemOp(Query, Op == G_LOAD);
      [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
        unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
        if (DstSize > MemSize)
        if (MemSize > MaxSize)

        return Query.Types[0].isVector() &&
               needToSplitMemOp(Query, Op == G_LOAD);
      [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
        unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
        if (MemSize > MaxSize) {
          if (MaxSize % EltSize == 0) {
          unsigned NumPieces = MemSize / MaxSize;
          if (NumPieces == 1 || NumPieces >= NumElts ||
              NumElts % NumPieces != 0)
            return std::pair(0, EltTy);
        return std::pair(0, EltTy);
        return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)

      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})

if (ST.hasFlatAddressSpace()) {
  ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
  Atomics32.legalFor({{S32, FlatPtr}});

if (ST.hasLDSFPAtomicAddF32()) {
  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasLdsAtomicAddF64())
    Atomic.legalFor({{S64, LocalPtr}});
  if (ST.hasAtomicDsPkAdd16Insts())
    Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

if (ST.hasAtomicFaddInsts())
  Atomic.legalFor({{S32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
  Atomic.legalFor({{S32, FlatPtr}});

if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
    ST.hasAtomicBufferGlobalPkAddF16Insts())
  Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
if (ST.hasAtomicGlobalPkAddBF16Inst())
  Atomic.legalFor({{V2BF16, GlobalPtr}});
if (ST.hasAtomicFlatPkAdd16Insts())
  Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

auto &AtomicFMinFMax =
    .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

if (ST.hasAtomicFMinFMaxF32GlobalInsts())
if (ST.hasAtomicFMinFMaxF64GlobalInsts())
  AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
if (ST.hasAtomicFMinFMaxF32FlatInsts())
if (ST.hasAtomicFMinFMaxF64FlatInsts())

        {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
    LocalPtr, FlatPtr, PrivatePtr,
    .clampScalar(0, S16, S64)

if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2);
    Shifts.legalFor({{S16, S16}});

  Shifts.widenScalarIf(
        const LLT AmountTy = Query.Types[1];
        return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 16);
  Shifts.clampScalar(0, S16, S64);

  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 32);
  Shifts.clampScalar(0, S32, S64);
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
  unsigned IdxTypeIdx = 2;

        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const LLT IdxTy = Query.Types[IdxTypeIdx];
        const bool isLegalVecType =
        return (EltSize == 32 || EltSize == 64) &&

        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        return std::pair(VecTypeIdx,
      .clampScalar(EltTypeIdx, S32, S64)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

for (unsigned Op : {G_EXTRACT, G_INSERT}) {
  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

        const LLT BigTy = Query.Types[BigTyIdx];

        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];

        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];

if (ST.hasScalarPackInsts()) {
      .minScalarOrElt(0, S16)
  BuildVector.customFor({V2S16, S16});
  BuildVector.minScalarOrElt(0, S32);
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {

        const LLT BigTy = Query.Types[BigTyIdx];

        return notValidElt(Query, LitTyIdx);
        return notValidElt(Query, BigTyIdx);

  if (Op == G_MERGE_VALUES) {
    Builder.widenScalarIf(
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;

        const LLT Ty = Query.Types[BigTyIdx];
        return Ty.getSizeInBits() % 16 != 0;

        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

    .clampScalar(0, S32, S64);

if (ST.hasVOP3PInsts()) {
  SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
} else if (ST.has16BitInsts()) {
  SextInReg.lowerFor({{S32}, {S64}, {S16}});
  SextInReg.lowerFor({{S32}, {S64}});
FSHRActionDefs.legalFor({{S32, S32}})
    .clampMaxNumElementsStrict(0, S16, 2);
if (ST.hasVOP3PInsts())
  FSHRActionDefs.scalarize(0).lower();

if (ST.hasVOP3PInsts()) {
    .clampMaxNumElementsStrict(0, S16, 2)
    .clampScalar(1, S32, S32)

    G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
    G_READ_REGISTER, G_WRITE_REGISTER,

if (ST.hasIEEEMinimumMaximumInsts()) {
    .legalFor(FPTypesPK16)
} else if (ST.hasVOP3PInsts()) {
    .clampMaxNumElementsStrict(0, S16, 2)

    G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
    G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

    {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
     G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
     G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
     G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

verify(*ST.getInstrInfo());
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
case TargetOpcode::G_FCEIL:
case TargetOpcode::G_FREM:
case TargetOpcode::G_INTRINSIC_TRUNC:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_INSERT_VECTOR_ELT:
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
case TargetOpcode::G_GLOBAL_VALUE:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_STORE:
case TargetOpcode::G_FMAD:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FFREXP:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
case TargetOpcode::G_SDIVREM:
case TargetOpcode::G_ATOMIC_CMPXCHG:
case TargetOpcode::G_FLOG2:
case TargetOpcode::G_FLOG:
case TargetOpcode::G_FLOG10:
case TargetOpcode::G_FEXP2:
case TargetOpcode::G_FEXP:
case TargetOpcode::G_FEXP10:
case TargetOpcode::G_FPOW:
case TargetOpcode::G_FFLOOR:
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_BUILD_VECTOR_TRUNC:
case TargetOpcode::G_MUL:
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
case TargetOpcode::G_STACKSAVE:
case TargetOpcode::G_GET_FPENV:
case TargetOpcode::G_SET_FPENV:
case TargetOpcode::G_TRAP:
case TargetOpcode::G_DEBUGTRAP:
if (ST.hasApertureRegs()) {
      ? AMDGPU::SRC_SHARED_BASE
      : AMDGPU::SRC_PRIVATE_BASE;
  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
          !ST.hasGloballyAddressableScratch()) &&
         "Cannot use src_private_base with globally addressable scratch!");
  MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
  B.buildCopy({Dst}, {Register(ApertureRegNo)});
  return B.buildUnmerge(S32, Dst).getReg(1);

Register LoadAddr = MRI.createGenericVirtualRegister(
      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
  Register KernargPtrReg = MRI.createGenericVirtualRegister(
  B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

Register QueuePtr = MRI.createGenericVirtualRegister(
B.buildObjectPtrOffset(
    B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

switch (Def->getOpcode()) {
case AMDGPU::G_FRAME_INDEX:
case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR:
case AMDGPU::G_CONSTANT: {
  const ConstantInt *CI = Def->getOperand(1).getCImm();

assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
        Intrinsic::amdgcn_addrspacecast_nonnull));
                   : MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
unsigned SrcAS = SrcTy.getAddressSpace();

MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
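// Rough summary of the cast lowerings below (descriptive note, inferred from
// the surrounding code): flat -> local/private extracts the low 32 bits of
// the flat pointer (rebased off the flat-scratch base when globally
// addressable scratch is in use), while local/private -> flat rebuilds a
// 64-bit pointer from the 32-bit offset plus an aperture or flat-scratch
// base in the high half; null pointers are preserved via compare/select.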
auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
      ST.hasGloballyAddressableScratch()) {
    Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
    MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
    return B.buildIntToPtr(Dst, Sub).getReg(0);

  return B.buildExtract(Dst, Src, 0).getReg(0);

  castFlatToLocalOrPrivate(Dst);
  MI.eraseFromParent();

auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);

auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();

auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
      ST.hasGloballyAddressableScratch()) {
    ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
    if (ST.isWave64()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
        B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
    Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
        B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
        B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
    MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
    return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

  return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

  castLocalOrPrivateToFlat(Dst);
  MI.eraseFromParent();

Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
                SegmentNull.getReg(0));
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();

    SrcTy.getSizeInBits() == 64) {
  B.buildExtract(Dst, Src, 0);
  MI.eraseFromParent();

uint32_t AddrHiVal = Info->get32BitAddressHighBits();
auto PtrLo = B.buildPtrToInt(S32, Src);
if (AddrHiVal == 0) {
  B.buildIntToPtr(Dst, Zext);
  auto HighAddr = B.buildConstant(S32, AddrHiVal);
  B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
MI.eraseFromParent();

MI.eraseFromParent();
LLT Ty = MRI.getType(Src);
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

auto C1 = B.buildFConstant(Ty, C1Val);
auto CopySign = B.buildFCopysign(Ty, C1, Src);

auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

auto C2 = B.buildFConstant(Ty, C2Val);
auto Fabs = B.buildFAbs(Ty, Src);

B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
MI.eraseFromParent();
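// The G_FCEIL lowering below follows the usual trunc-based expansion:
// result = trunc(src), then add 1.0 (via the select feeding the final fadd)
// when src > 0 and src != trunc(src).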
auto Trunc = B.buildIntrinsicTrunc(S64, Src);

const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);

auto And = B.buildAnd(S1, Lt0, NeTrunc);
auto Add = B.buildSelect(S64, And, One, Zero);

B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();

Register Src0Reg = MI.getOperand(1).getReg();
Register Src1Reg = MI.getOperand(2).getReg();
auto Flags = MI.getFlags();
LLT Ty = MRI.getType(DstReg);

auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
auto Neg = B.buildFNeg(Ty, Trunc, Flags);
B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
MI.eraseFromParent();
const unsigned FractBits = 52;
const unsigned ExpBits = 11;

auto Const0 = B.buildConstant(S32, FractBits - 32);
auto Const1 = B.buildConstant(S32, ExpBits);

auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                   .addUse(Const0.getReg(0))
                   .addUse(Const1.getReg(0));

return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

auto Unmerge = B.buildUnmerge({S32, S32}, Src);

const unsigned FractBits = 52;

const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

const auto Zero32 = B.buildConstant(S32, 0);

auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
auto Tmp0 = B.buildAnd(S64, Src, Not);
auto FiftyOne = B.buildConstant(S32, FractBits - 1);

auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();
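// 64-bit integer to FP conversion below: split the source into 32-bit
// halves, convert the high half, scale it by 2^32 with ldexp, and add the
// converted low half; the 32-bit-result path instead normalizes the value
// with a shift and applies the shift amount with a final ldexp.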
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);

if (MRI.getType(Dst) == S64) {
  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();

auto One = B.buildConstant(S32, 1);

auto ThirtyOne = B.buildConstant(S32, 31);
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
              .addUse(Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

auto Norm = B.buildShl(S64, Src, ShAmt);
auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
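// FP to 64-bit integer below: take trunc(x), split it into high and low
// 32-bit pieces using the K0/K1 scale constants (their exact values are not
// visible in this fragment), convert each piece with fptoui/fptosi, and
// re-merge; for the signed case the sign derived from the source is applied
// with the final xor/sub.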
const LLT SrcLT = MRI.getType(Src);

unsigned Flags = MI.getFlags();

auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
  Trunc = B.buildFAbs(S32, Trunc, Flags);

  K0 = B.buildFConstant(
  K1 = B.buildFConstant(
  K0 = B.buildFConstant(
  K1 = B.buildFConstant(

auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

               : B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);

  Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

  B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
LLT VecTy = MRI.getType(Vec);

auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
B.buildIntToPtr(Dst, IntElt);

MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

auto Unmerge = B.buildUnmerge(EltTy, Vec);
B.buildCopy(Dst, Unmerge.getReg(IdxVal));

MI.eraseFromParent();

LLT VecTy = MRI.getType(Vec);

auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
auto IntIns = B.buildPtrToInt(IntTy, Ins);
auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
B.buildIntToPtr(Dst, IntVecDest);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

if (IdxVal < NumElts) {
  for (unsigned i = 0; i < NumElts; ++i)
    SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
  B.buildUnmerge(SrcRegs, Vec);

  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
  B.buildMergeLikeInstr(Dst, SrcRegs);

MI.eraseFromParent();
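// G_FSIN/G_FCOS lowering below: scale the input by 1/(2*pi), optionally
// reduce it to [0,1) with amdgcn.fract on targets with the reduced trig
// range, then emit amdgcn.sin or amdgcn.cos on the scaled value.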
LLT Ty = MRI.getType(DstReg);
unsigned Flags = MI.getFlags();

if (ST.hasTrigReducedRange()) {
  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                .addUse(MulVal.getReg(0))
  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

MI.eraseFromParent();
unsigned GAFlags) const {

    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

if (ST.has64BitLiterals()) {
  B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
  B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

if (!B.getMRI()->getRegClassOrNull(PCReg))
  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  B.buildExtract(DstReg, PCReg, 0);

if (RequiresHighHalf && ST.has64BitLiterals()) {
  if (!MRI.getRegClassOrNull(DstReg))
    MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
  B.buildInstr(AMDGPU::S_MOV_B64)

Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                      : MRI.createGenericVirtualRegister(S32);

if (!MRI.getRegClassOrNull(AddrLo))
  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

B.buildInstr(AMDGPU::S_MOV_B32)

if (RequiresHighHalf) {
         "Must provide a 64-bit pointer type!");

  MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)

  if (!MRI.getRegClassOrNull(AddrDst))
    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

  B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

  if (AddrDst != DstReg)
    B.buildCast(DstReg, AddrDst);
} else if (AddrLo != DstReg) {
  B.buildCast(DstReg, AddrLo);
LLT Ty = MRI.getType(DstReg);
unsigned AS = Ty.getAddressSpace();

    GV->getName() != "llvm.amdgcn.module.lds" &&
        Fn, "local memory global used by non-kernel function",

  B.buildUndef(DstReg);
  MI.eraseFromParent();

auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
B.buildIntToPtr(DstReg, Sz);
MI.eraseFromParent();

MI.eraseFromParent();

if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
  MI.eraseFromParent();

  MI.eraseFromParent();

  MI.eraseFromParent();

Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

if (Ty.getSizeInBits() == 32) {
  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
  B.buildExtract(DstReg, Load, 0);
  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
MI.eraseFromParent();

LLT PtrTy = MRI.getType(PtrReg);

auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
MI.getOperand(1).setReg(Cast.getReg(0));

if (MI.getOpcode() != AMDGPU::G_LOAD)

LLT ValTy = MRI.getType(ValReg);

const unsigned ValSize = ValTy.getSizeInBits();

if (WideMemSize == ValSize) {
  MI.setMemRefs(MF, {WideMMO});

if (ValSize > WideMemSize)

  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
  B.buildTrunc(ValReg, WideLoad).getReg(0);

  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
  B.buildExtract(ValReg, WideLoad, 0);

  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
  B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

MI.eraseFromParent();

Register DataReg = MI.getOperand(0).getReg();
LLT DataTy = MRI.getType(DataReg);

LLT Ty = MRI.getType(MI.getOperand(0).getReg());

       "this should not have been custom lowered");

LLT ValTy = MRI.getType(CmpVal);

Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());

MI.eraseFromParent();
switch (DefMI->getOpcode()) {
case TargetOpcode::G_INTRINSIC: {
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_log_clamp:
  case Intrinsic::amdgcn_exp2:
  case Intrinsic::amdgcn_sqrt:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_FFREXP: {
  if (DefMI->getOperand(0).getReg() == Src)
case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
                                       unsigned Flags) const {

auto SmallestNormal = B.buildFConstant(
auto IsLtSmallestNormal =

auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
auto One = B.buildFConstant(F32, 1.0);
    B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, Src, Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(Dst, Log2, Flags);
  MI.eraseFromParent();

  B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
  MI.eraseFromParent();

auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                .addUse(ScaledInput)

auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
B.buildFSub(Dst, Log2, ResultOffset, Flags);

MI.eraseFromParent();

auto FMul = B.buildFMul(Ty, X, Y, Flags);
return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

unsigned Flags = MI.getFlags();
const LLT Ty = MRI.getType(X);

  auto PromoteSrc = B.buildFPExt(F32, X);
  B.buildFPTrunc(Dst, LogVal);

  MI.eraseFromParent();
    B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
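// Extended-precision log expansion below: log(x) is computed as
// log2(x) * c, where c = log10(2) or ln(2) is split into a head/tail pair
// (c/cc on fast-FMA targets, ch/ct otherwise) so the multiply keeps extra
// precision; non-finite inputs and scaled denormal inputs are fixed up
// afterwards with selects.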
if (ST.hasFastFMAF32()) {
  const float c_log10 = 0x1.344134p-2f;
  const float cc_log10 = 0x1.09f79ep-26f;

  const float c_log = 0x1.62e42ep-1f;
  const float cc_log = 0x1.efa39ep-25f;

  auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
  auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

  R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
  auto NegR = B.buildFNeg(Ty, R, NewFlags);
  auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
  auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
  R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

  const float ch_log10 = 0x1.344000p-2f;
  const float ct_log10 = 0x1.3509f6p-18f;

  const float ch_log = 0x1.62e000p-1f;
  const float ct_log = 0x1.0bfbe8p-15f;

  auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
  auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

  auto MaskConst = B.buildConstant(Ty, 0xfffff000);
  auto YH = B.buildAnd(Ty, Y, MaskConst);
  auto YT = B.buildFSub(Ty, Y, YH, Flags);

  auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);

      getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
  R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

const bool IsFiniteOnly =

if (!IsFiniteOnly) {
  auto Fabs = B.buildFAbs(Ty, Y);
  R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
  auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
  B.buildFSub(Dst, R, Shift, Flags);
  B.buildCopy(Dst, R);

MI.eraseFromParent();
unsigned Flags) const {
const double Log2BaseInverted =

LLT Ty = B.getMRI()->getType(Dst);

  auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
  auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
  auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

  if (ST.hasFastFMAF32())
    B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
    auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
    B.buildFAdd(Dst, Mul, ResultOffset, Flags);

    ? B.buildFLog2(Ty, Src, Flags)
    : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);

  auto Ext = B.buildFPExt(F32, Src, Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(Dst, Log2, Flags);
  MI.eraseFromParent();
  MI.eraseFromParent();

auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                            RangeCheckConst, Flags);

auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(AddInput.getReg(0))

auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
auto One = B.buildFConstant(Ty, 1.0);
auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
B.buildFMul(Dst, Exp2, ResultScale, Flags);
MI.eraseFromParent();

                     const SrcOp &Src, unsigned Flags) {
LLT Ty = Dst.getLLTTy(*B.getMRI());

  return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
      .addUse(Src.getReg())
return B.buildFExp2(Dst, Src, Flags);

                                   bool IsExp10) const {
LLT Ty = B.getMRI()->getType(X);

auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
auto Mul = B.buildFMul(Ty, X, Const, Flags);

LLT Ty = B.getMRI()->getType(Dst);

auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(ExpInput.getReg(0))

auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
                                     unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);

auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);

auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);

auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);

auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);

auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);

B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
  Dn = B.buildFRint(S64, X, Flags).getReg(0);
  F = B.buildFSub(S64, X, Dn, Flags).getReg(0);

  auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
  auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
  auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
  T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
} else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
  auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
  auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
  Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

  auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
  auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
  auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
  auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
  F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

  auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
  auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
  auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
  T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);

  auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
  auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
  Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

  auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
  auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
  auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
  auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
  T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
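// The chain of fused multiply-adds below evaluates a minimax-style
// polynomial in T (the reduced argument) via Horner's rule; the result is
// then scaled by 2^Dn with ldexp and clamped to +inf / 0 for inputs beyond
// the overflow / underflow thresholds.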
auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);

auto One = B.buildFConstant(S64, 1.0);
P = B.buildFMA(S64, T, P, One, Flags);
P = B.buildFMA(S64, T, P, One, Flags);

auto DnInt = B.buildFPTOSI(S32, Dn);
auto Z = B.buildFLdexp(S64, P, DnInt, Flags);

Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);

MI.eraseFromParent();
const unsigned Flags = MI.getFlags();

LLT Ty = MRI.getType(Dst);

const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

  MI.eraseFromParent();

  auto Ext = B.buildFPExt(F32, X, Flags);
  B.buildFPTrunc(Dst, Lowered, Flags);
  MI.eraseFromParent();

  MI.eraseFromParent();

const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

if (ST.hasFastFMAF32()) {
  const float cc_exp = 0x1.4ae0bep-26f;
  const float c_exp10 = 0x1.a934f0p+1f;
  const float cc_exp10 = 0x1.2f346ep-24f;

  auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
  PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
  auto NegPH = B.buildFNeg(Ty, PH, Flags);
  auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

  auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
  PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

  const float ch_exp = 0x1.714000p+0f;
  const float cl_exp = 0x1.47652ap-12f;

  const float ch_exp10 = 0x1.a92000p+1f;
  const float cl_exp10 = 0x1.4f0978p-11f;

  auto MaskConst = B.buildConstant(Ty, 0xfffff000);
  auto XH = B.buildAnd(Ty, X, MaskConst);
  auto XL = B.buildFSub(Ty, X, XH, Flags);

  auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
  PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

  auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
  auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

      getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
  PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(A.getReg(0))
auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

auto UnderflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);

R = B.buildSelect(Ty, Underflow, Zero, R);

auto OverflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

B.buildCopy(Dst, R);
MI.eraseFromParent();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);

  auto Log = B.buildFLog2(F32, Src0, Flags);
  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                 .addUse(Log.getReg(0))
  B.buildFExp2(Dst, Mul, Flags);
} else if (Ty == F16) {
  auto Log = B.buildFLog2(F16, Src0, Flags);
  auto Ext0 = B.buildFPExt(F32, Log, Flags);
  auto Ext1 = B.buildFPExt(F32, Src1, Flags);
  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                 .addUse(Ext0.getReg(0))
                 .addUse(Ext1.getReg(0))
  B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

MI.eraseFromParent();

  ModSrc = SrcFNeg->getOperand(1).getReg();
  ModSrc = SrcFAbs->getOperand(1).getReg();
  ModSrc = SrcFAbs->getOperand(1).getReg();

Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
       "this should not have been custom lowered");

auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  B.buildFMinNum(Min, Fract, Const, Flags);

  CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

MI.eraseFromParent();

if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
  Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
  Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
B.buildBitcast(Dst, Merge);

MI.eraseFromParent();
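// Wide-multiply helper below: the operands are split into 32-bit parts and
// the partial products are accumulated column by column, using
// G_AMDGPU_MAD_U64_U32 where a 64-bit accumulator is available and plain
// mul / add-with-carry chains otherwise; UsePartialMad64_32 and
// SeparateOddAlignedProducts select between the accumulation schemes.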
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

auto getZero32 = [&]() -> Register {
    Zero32 = B.buildConstant(S32, 0).getReg(0);
auto getZero64 = [&]() -> Register {
    Zero64 = B.buildConstant(S64, 0).getReg(0);

for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;
      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);
for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
  Carry OddCarryIn = std::move(OddCarry);
  Carry EvenCarryIn = std::move(EvenCarry);

  if (2 * i < Accum.size()) {
    auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
    EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

    if (!SeparateOddAlignedProducts) {
      auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
      OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      bool IsHighest = 2 * i >= Accum.size();
                            .take_front(IsHighest ? 1 : 2);
      OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
        Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
      Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                               Lo->getOperand(1).getReg());
        Accum[2 * i] = Hi.getReg(0);
        SeparateOddCarry = Hi.getReg(1);

    if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
      EvenCarryIn.push_back(CarryOut);

    if (2 * i < Accum.size()) {
      if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
        OddCarry.push_back(CarryOut);

assert(ST.hasMad64_32());
assert(MI.getOpcode() == TargetOpcode::G_MUL);

LLT Ty = MRI.getType(DstReg);

unsigned Size = Ty.getSizeInBits();
if (ST.hasVectorMulU64() && Size == 64)

unsigned NumParts = Size / 32;

const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

for (unsigned i = 0; i < NumParts; ++i) {

B.buildUnmerge(Src0Parts, Src0);
B.buildUnmerge(Src1Parts, Src1);

buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
              SeparateOddAlignedProducts);

B.buildMergeLikeInstr(DstReg, AccumRegs);
MI.eraseFromParent();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                      ? AMDGPU::G_AMDGPU_FFBH_U32
                      : AMDGPU::G_AMDGPU_FFBL_B32;
auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
MI.eraseFromParent();
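// Sub-32-bit G_CTLZ_ZERO_UNDEF below: any-extend the source to 32 bits,
// shift it into the top bits so the leading zeros of the original value are
// preserved, count with G_AMDGPU_FFBH_U32, and truncate the result back.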
LLT SrcTy = MRI.getType(Src);
TypeSize NumBits = SrcTy.getSizeInBits();

auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
auto Shift = B.buildShl(S32, Extend, ShiftAmt);
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();

if (MI.getOpcode() != TargetOpcode::G_XOR)
return ConstVal == -1;

Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))

if (!MRI.hasOneNonDBGUse(NegatedCond))

UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

if (UseMI->getParent() != Parent ||
    UseMI->getOpcode() != AMDGPU::G_BRCOND)

UncondBrTarget = &*NextMBB;

if (Next->getOpcode() != AMDGPU::G_BR)
                                 *ArgRC, B.getDebugLoc(), ArgTy);

const unsigned Mask = Arg->getMask();

  auto ShiftAmt = B.buildConstant(S32, Shift);
  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
B.buildCopy(DstReg, LiveIn);

if (!ST.hasClusters()) {
  MI.eraseFromParent();

Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);

auto One = B.buildConstant(S32, 1);
auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                              B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

  B.buildCopy(DstReg, GlobalIdXYZ);
  MI.eraseFromParent();

  B.buildCopy(DstReg, ClusterIdXYZ);
  MI.eraseFromParent();

unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_GETREG_B32_const)
    .addImm(ClusterIdField);
auto Zero = B.buildConstant(S32, 0);
B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
MI.eraseFromParent();
auto LoadConstant = [&](unsigned N) {
  B.buildConstant(DstReg, N);

if (ST.hasArchitectedSGPRs() &&

  Arg = &WorkGroupIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;
  Arg = &WorkGroupIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;
  Arg = &WorkGroupIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;

  if (HasFixedDims && ClusterDims.getDims()[0] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;
  if (HasFixedDims && ClusterDims.getDims()[1] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;
  if (HasFixedDims && ClusterDims.getDims()[2] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(ClusterDims.getDims()[0] - 1);
  Arg = &ClusterWorkGroupMaxIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[1] - 1);
  Arg = &ClusterWorkGroupMaxIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[2] - 1);
  Arg = &ClusterWorkGroupMaxIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;
  Arg = &ClusterWorkGroupMaxFlatID;
  ArgRC = &AMDGPU::SReg_32RegClass;

  return LoadConstant(0);

B.buildUndef(DstReg);

if (!Arg->isRegister() || !Arg->getRegister().isValid())

MI.eraseFromParent();

B.buildConstant(MI.getOperand(0).getReg(), C);
MI.eraseFromParent();

unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

  B.buildUndef(DstReg);
  MI.eraseFromParent();

if (Arg->isMasked()) {

MI.eraseFromParent();

Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                 Align Alignment) const {
       "unexpected kernarg parameter type");

MI.eraseFromParent();
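// 32-bit unsigned division below: form an initial estimate of 1/Y with the
// hardware reciprocal (RCP_IFLAG on uitofp(Y)), refine it with one
// Newton-Raphson-style step using umulh, then compute the quotient and
// remainder and correct them with a final compare/select round.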
  LLT DstTy = MRI.getType(Dst);
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
  auto One = B.buildConstant(S32, 1);
  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
  auto Unmerge = B.buildUnmerge(S32, Val);
  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
  auto Mad = B.buildFMAD(
  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
  auto Mul2 = B.buildFMul(
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
  auto Mad2 = B.buildFMAD(
  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);
  return {ResultLo.getReg(0), ResultHi.getReg(0)};
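// 64-bit unsigned div/rem (below): starting from the 32x32 reciprocal
// estimate built above, two refinement rounds multiply by the negated
// denominator and propagate carries with uaddo/uadde, then the quotient
// (MulHi3) and remainder (Sub1) receive up to two conditional corrections
// (Add3/Add4 and Sub2/Sub3) chosen by the compare-derived selects.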
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);
  auto C1 = B.buildSExt(S32, CmpHi);
  auto C2 = B.buildSExt(S32, CmpLo);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);
  auto C6 = B.buildSelect(
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
  auto Sel1 = B.buildSelect(
  auto Sel2 = B.buildSelect(
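// Dispatch G_UDIV/G_UREM/G_UDIVREM onto the expansion above. The signed
// forms below normalize both operands to their absolute values with an
// ashr/add/xor sign trick, divide unsigned into temporary registers, and
// re-apply the sign to the results with a final xor/sub.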
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
  B.buildSub(DstDivReg, SignXor, Sign);
  auto Sign = LHSign.getReg(0);
  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
  B.buildSub(DstRemReg, SignXor, Sign);
  MI.eraseFromParent();
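// Fast, inaccurate FDIV (below): constant numerators of 1.0 / -1.0 fold
// straight into amdgcn_rcp (negating the denominator for -1.0); otherwise
// the division becomes rcp(RHS) * LHS. The 64-bit variant refines rcp(Y)
// with two FMA-based Newton iterations before the final multiply/FMA.
// Both paths are only taken when an inaccurate rcp is permitted.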
  LLT ResTy = MRI.getType(Res);
  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
    MI.eraseFromParent();
  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))
    MI.eraseFromParent();
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
  B.buildFMul(Res, LHS, RCP, Flags);
  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);
  if (!AllowInaccurateRcp)
  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);
  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);
  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
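// f16 FDIV (below) is evaluated in f32: extend both operands, refine
// LHS * rcp(RHS) with FMAD/FMA error terms, keep only the exponent bits of
// the final error (mask 0xff800000), add it back in, truncate to f16 and
// finish with amdgcn_div_fixup.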
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
  MI.eraseFromParent();
  unsigned SPDenormMode =
  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
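// IEEE-quality f32 FDIV (below): amdgcn_div_scale pre-scales numerator and
// denominator, a chain of FMAs (Fma0..Fma4) refines the approximate
// reciprocal, amdgcn_div_fmas applies the div_scale condition bit and
// amdgcn_div_fixup produces the final quotient. When the FP mode does not
// already keep f32 denormals, denormal flushing is toggled around the FMA
// chain (via S_DENORM_MODE or S_GETREG/S_SETREG of the saved mode).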
  auto One = B.buildFConstant(S32, 1.0f);
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
  const bool HasDynamicDenormals =
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
  MI.eraseFromParent();
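// f64 FDIV (below) follows the same div_scale / FMA refinement / div_fmas /
// div_fixup pattern. On subtargets without a usable div_scale condition
// output, the condition is recomputed by comparing the unmerged high halves
// of the operands against the div_scale results and xor-ing the compares.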
  auto One = B.buildFConstant(S64, 1.0);
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
  if (!ST.hasUsableDivScaleConditionOutput()) {
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
                              Scale1Unmerge.getReg(1));
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
    Scale = DivScale1.getReg(1);
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
  MI.eraseFromParent();
  LLT Ty = MRI.getType(Res0);
  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);
  MI.eraseFromParent();
  auto Abs = B.buildFAbs(S32, RHS, Flags);
  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
  B.buildFMul(Res, Sel, Mul1, Flags);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());
  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
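// f32 G_FSQRT expansion (below): inputs below 2^-96 are pre-scaled by 2^+32
// so the core sequence stays in range. One path takes amdgcn_sqrt and nudges
// the result by +/-1 ulp based on the FMA residuals (SqrtVP/SqrtVS); the
// other seeds with amdgcn_rsq and runs a half/estimate FMA refinement. The
// result is rescaled by 2^-16 and zero/inf inputs are passed through.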
  const unsigned Flags = MI.getFlags();
  MI.eraseFromParent();
  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
      .addUse(SqrtX.getReg(0))
  auto NegOne = B.buildConstant(I32, -1);
  auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
  auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
  auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
  auto PosOne = B.buildConstant(I32, 1);
  auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
  auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
  auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
  auto Zero = B.buildFConstant(F32, 0.0f);
  B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
  B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
  B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
  auto Half = B.buildFConstant(F32, 0.5f);
  auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
  auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
  auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
  SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
  SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
  auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
  auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
  SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
  MI.eraseFromParent();
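// f64 has no hardware sqrt (below): small inputs are scaled up by 2^256 via
// ldexp, amdgcn_rsq provides the seed, two rounds of FMA-based
// Newton-Raphson refinement follow, and the result is rescaled with
// ldexp(-128); +/-0 and infinity are selected through unchanged.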
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
  unsigned Flags = MI.getFlags();
  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
  auto ZeroInt = B.buildConstant(S32, 0);
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);
  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
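// Lane-value intrinsics (readlane, writelane, permlane*, set_inactive,
// mov_dpp8, update_dpp) are legalized below in 32-bit pieces (64-bit pieces
// for update_dpp when DPALU DPP is available): narrow types are any-extended
// to 32 bits, wider types are unmerged into parts, the lane op is rebuilt
// per part by createLaneOp, and the parts are merged (and bitcast if
// necessary) back into the destination type.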
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
  if (Size == SplitSize) {
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    if (IID == Intrinsic::amdgcn_writelane)
    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
  if (Size % SplitSize != 0)
  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      NeedsBitcast = true;
  unsigned NumParts = Size / SplitSize;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);
  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);
    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);
    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
    B.buildBitcast(DstReg, B.buildMergeLikeInstr(
    B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
6120 ST.getTargetLowering()->getImplicitParameterOffset(
6122 LLT DstTy =
MRI.getType(DstReg);
6125 Register KernargPtrReg =
MRI.createGenericVirtualRegister(DstTy);
6130 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6131 B.buildConstant(IdxTy,
Offset).getReg(0));
6142 Register Pointer =
MI.getOperand(2).getReg();
6144 Register NumRecords =
MI.getOperand(4).getReg();
6150 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6152 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6154 if (ST.has45BitNumRecordsBufferResource()) {
6159 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6160 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6161 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6162 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6166 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6167 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6168 auto ExtShiftedStride =
6169 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6170 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6171 auto ExtShiftedFlags =
6172 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6173 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6175 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6176 B.buildMergeValues(Result, {LowHalf, HighHalf});
6178 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6179 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6180 auto LowHalf = Unmerge.getReg(0);
6181 auto HighHalf = Unmerge.getReg(1);
6183 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6184 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6185 auto ShiftConst =
B.buildConstant(
S32, 16);
6186 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6187 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6188 Register NewHighHalfReg = NewHighHalf.getReg(0);
6189 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6192 MI.eraseFromParent();
6209 MI.eraseFromParent();
6217 std::optional<uint32_t> KnownSize =
6219 if (KnownSize.has_value())
6220 B.buildConstant(DstReg, *KnownSize);
6238 MI.eraseFromParent();
6245 unsigned AddrSpace)
const {
6247 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6251 ST.hasGloballyAddressableScratch()) {
6253 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6254 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6256 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6258 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6260 B.buildConstant(
S32, 1u << 26));
6265 MI.eraseFromParent();
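// splitBufferOffsets (below): split a buffer offset into a register part and
// an immediate part. Bits that do not fit the maximum immediate offset
// overflow into the register component, which becomes either the constant
// overflow itself or base + overflow when a base register is present.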
std::pair<Register, unsigned>
  bool CheckNUW = ST.hasGFX1250Insts();
      MRI, OrigOffset, nullptr, CheckNUW);
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    BaseReg = B.buildConstant(S32, 0).getReg(0);
  return std::pair(BaseReg, ImmOffset);
6329 bool ImageStore)
const {
6332 LLT StoreVT =
MRI.getType(Reg);
6335 if (ST.hasUnpackedD16VMem()) {
6336 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6339 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6340 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6348 if (ImageStore && ST.hasImageStoreD16Bug()) {
6351 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6353 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6360 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6361 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6363 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6371 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6372 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6374 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6391 bool IsFormat)
const {
6393 LLT Ty =
MRI->getType(VData);
6403 VData =
B.buildBitcast(Ty, VData).getReg(0);
6411 if (Ty.isVector()) {
6412 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6424 bool IsFormat)
const {
6429 LLT Ty =
MRI.getType(VData);
6431 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6446 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6449 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6453 VIndex =
MI.getOperand(3).getReg();
6456 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6459 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6460 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6464 Format =
MI.getOperand(5 + OpOffset).getImm();
6468 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6474 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6475 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6476 }
else if (IsFormat) {
6477 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6478 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6482 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6485 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6488 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6493 auto MIB =
B.buildInstr(
Opc)
6504 MIB.addImm(AuxiliaryData)
6505 .addImm(HasVIndex ? -1 : 0)
6506 .addMemOperand(MMO);
6508 MI.eraseFromParent();
6514 unsigned ImmOffset,
unsigned Format,
6517 auto MIB =
B.buildInstr(
Opc)
6528 MIB.addImm(AuxiliaryData)
6529 .addImm(HasVIndex ? -1 : 0)
6530 .addMemOperand(MMO);
6536 bool IsTyped)
const {
6550 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6551 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6553 StatusDst =
MI.getOperand(1).getReg();
6558 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6561 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6564 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6567 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6570 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6573 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6574 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6578 Format =
MI.getOperand(5 + OpOffset).getImm();
6582 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6585 LLT Ty =
MRI.getType(Dst);
6592 Dst =
MI.getOperand(0).getReg();
6593 B.setInsertPt(
B.getMBB(),
MI);
6600 Dst =
MI.getOperand(0).getReg();
6601 B.setInsertPt(
B.getMBB(),
MI);
6605 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6606 const bool Unpacked = ST.hasUnpackedD16VMem();
6616 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6617 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6618 }
else if (IsFormat) {
6622 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6624 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6625 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6630 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6631 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6634 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6635 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6638 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6639 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6645 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6646 unsigned NumLoadDWords = NumValueDWords + 1;
6648 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6650 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6652 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6653 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6654 B.buildTrunc(Dst, ExtDst);
6655 }
else if (NumValueDWords == 1) {
6656 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6659 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6660 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6662 B.buildUnmerge(LoadElts, LoadDstReg);
6664 B.buildMergeLikeInstr(Dst, LoadElts);
6667 (IsD16 && !Ty.isVector())) {
6668 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6670 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6671 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6672 B.buildTrunc(Dst, LoadDstReg);
6673 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6675 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6677 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6678 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6680 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6682 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6683 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6684 B.buildMergeLikeInstr(Dst, Repack);
6687 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6690 MI.eraseFromParent();
6696 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6697 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6698 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6699 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6700 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6701 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6702 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6703 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6704 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6705 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6706 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6707 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6708 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6709 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6710 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6711 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6712 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6713 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6714 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6715 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6716 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6717 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6718 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6719 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6720 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6721 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6722 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6723 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6724 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6725 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6726 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6727 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6728 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6729 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6730 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6731 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6732 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6733 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6734 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6735 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6736 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6737 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6738 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6739 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6740 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6741 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6742 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6743 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6744 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6745 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6746 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6747 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6748 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6749 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6750 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6751 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6752 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6753 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6754 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6755 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6756 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6757 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6758 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6759 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6760 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6761 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6762 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6763 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6764 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6765 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6766 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6767 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6768 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6769 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6770 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6771 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6772 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6773 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6774 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6775 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6776 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6777 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6778 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6779 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6780 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6781 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6782 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6783 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6784 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6785 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6794 const bool IsCmpSwap =
6795 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6796 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6797 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6798 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6809 CmpVal =
MI.getOperand(3).getReg();
6814 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6815 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6818 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6821 VIndex =
MI.getOperand(4 + OpOffset).getReg();
6824 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
6827 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
6828 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
6829 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
6848 .addImm(AuxiliaryData)
6849 .addImm(HasVIndex ? -1 : 0)
6850 .addMemOperand(MMO);
6852 MI.eraseFromParent();
6862 bool IsA16,
bool IsG16) {
6878 (
B.getMRI()->getType(AddrReg) ==
S16)) {
6883 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6887 "Bias needs to be converted to 16 bit in A16 mode");
6889 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
6895 if (((
I + 1) >= EndIdx) ||
6902 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
6904 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6909 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6920 int DimIdx,
int NumVAddrs) {
6924 for (
int I = 0;
I != NumVAddrs; ++
I) {
6926 if (
SrcOp.isReg()) {
6932 int NumAddrRegs = AddrRegs.
size();
6933 if (NumAddrRegs != 1) {
6936 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6939 for (
int I = 1;
I != NumVAddrs; ++
I) {
6942 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
6964 const unsigned NumDefs =
MI.getNumExplicitDefs();
6965 const unsigned ArgOffset = NumDefs + 1;
6966 bool IsTFE = NumDefs == 2;
6984 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6985 Ty =
MRI->getType(VData);
6988 const bool IsAtomicPacked16Bit =
6989 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6990 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6998 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
6999 const bool IsA16 = AddrTy ==
S16;
7000 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7003 if (!BaseOpcode->
Atomic) {
7004 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7007 }
else if (DMask != 0) {
7009 }
else if (!IsTFE && !BaseOpcode->
Store) {
7011 B.buildUndef(
MI.getOperand(0));
7012 MI.eraseFromParent();
7020 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7021 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7022 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7023 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7024 unsigned NewOpcode = LoadOpcode;
7025 if (BaseOpcode->
Store)
7026 NewOpcode = StoreOpcode;
7028 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7031 MI.setDesc(
B.getTII().get(NewOpcode));
7035 if (IsTFE && DMask == 0) {
7038 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7041 if (BaseOpcode->
Atomic) {
7043 LLT Ty =
MRI->getType(VData0);
7046 if (Ty.isVector() && !IsAtomicPacked16Bit)
7053 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7054 MI.getOperand(2).setReg(
Concat.getReg(0));
7055 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7059 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7062 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7068 if (IsA16 && !ST.hasA16()) {
7073 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7074 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7076 if (IsA16 || IsG16) {
7084 const bool UseNSA = ST.hasNSAEncoding() &&
7085 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7086 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7087 const bool UsePartialNSA =
7088 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7090 if (UsePartialNSA) {
7094 auto Concat =
B.buildConcatVectors(
7095 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7096 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7097 PackedRegs.
resize(NSAMaxSize);
7098 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7100 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7101 PackedRegs[0] =
Concat.getReg(0);
7105 const unsigned NumPacked = PackedRegs.
size();
7108 if (!
SrcOp.isReg()) {
7118 SrcOp.setReg(AMDGPU::NoRegister);
7135 const bool UseNSA = ST.hasNSAEncoding() &&
7136 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7137 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7138 const bool UsePartialNSA =
7139 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7141 if (UsePartialNSA) {
7143 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7145 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7160 if (!Ty.isVector() || !IsD16)
7164 if (RepackedReg != VData) {
7165 MI.getOperand(1).setReg(RepackedReg);
7173 const int NumElts = Ty.
isVector() ? Ty.getNumElements() : 1;
7176 if (NumElts < DMaskLanes)
7179 if (NumElts > 4 || DMaskLanes > 4)
7189 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7190 const LLT AdjustedTy =
7206 if (IsD16 && ST.hasUnpackedD16VMem()) {
7213 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7214 unsigned RoundedSize = 32 * RoundedElts;
7218 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7223 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7229 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7233 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7234 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7236 Register NewResultReg =
MRI->createGenericVirtualRegister(LoadResultTy);
7238 MI.getOperand(0).setReg(NewResultReg);
7246 Dst1Reg =
MI.getOperand(1).getReg();
7247 if (
MRI->getType(Dst1Reg) !=
S32)
7251 MI.removeOperand(1);
7255 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7264 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7266 if (ResultNumRegs == 1) {
7268 ResultRegs[0] = NewResultReg;
7271 for (
int I = 0;
I != NumDataRegs; ++
I)
7272 ResultRegs[
I] =
MRI->createGenericVirtualRegister(RegTy);
7273 B.buildUnmerge(ResultRegs, NewResultReg);
7278 ResultRegs.
resize(NumDataRegs);
7283 if (IsD16 && !Ty.isVector()) {
7284 B.buildTrunc(DstReg, ResultRegs[0]);
7289 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7290 B.buildBitcast(DstReg, ResultRegs[0]);
7302 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7304 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7305 }
else if (ST.hasUnpackedD16VMem()) {
7307 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7311 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7315 for (
int I = 0;
I != NumElts; ++
I)
7320 LLT ResTy =
MRI->getType(ResultRegs[0]);
7322 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7323 B.buildBuildVector(DstReg, ResultRegs);
7327 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7328 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7334 if (ResultRegs.
size() == 1) {
7335 NewResultReg = ResultRegs[0];
7336 }
else if (ResultRegs.
size() == 2) {
7338 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7344 if (
MRI->getType(DstReg).getNumElements() <
7345 MRI->getType(NewResultReg).getNumElements()) {
7346 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7348 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7353 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7354 B.buildConcatVectors(DstReg, ResultRegs);
7363 Register OrigDst =
MI.getOperand(0).getReg();
7365 LLT Ty =
B.getMRI()->getType(OrigDst);
7366 unsigned Size = Ty.getSizeInBits();
7369 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7371 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7372 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7375 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7377 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7386 B.setInsertPt(
B.getMBB(),
MI);
7391 B.setInsertPt(
B.getMBB(),
MI);
7397 MI.setDesc(
B.getTII().get(
Opc));
7398 MI.removeOperand(1);
7401 const unsigned MemSize = (
Size + 7) / 8;
7402 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7409 MI.addMemOperand(MF, MMO);
7410 if (Dst != OrigDst) {
7411 MI.getOperand(0).setReg(Dst);
7412 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7413 B.buildTrunc(OrigDst, Dst);
7435 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7436 MI.removeOperand(0);
7446 if (!ST.hasTrapHandler() ||
7450 return ST.supportsGetDoorbellID() ?
7463 MI.eraseFromParent();
7473 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7475 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7479 MI.eraseFromParent();
7488 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7495 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7497 Register KernargPtrReg =
MRI.createGenericVirtualRegister(
7513 Register LoadAddr =
MRI.createGenericVirtualRegister(
7515 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7518 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7519 B.buildCopy(SGPR01, Temp);
7520 B.buildInstr(AMDGPU::S_TRAP)
7523 MI.eraseFromParent();
7534 B.buildCopy(SGPR01, LiveIn);
7535 B.buildInstr(AMDGPU::S_TRAP)
7539 MI.eraseFromParent();
7548 if (ST.hasPrivEnabledTrap2NopBug()) {
7549 ST.getInstrInfo()->insertSimulatedTrap(
MRI,
B.getMBB(),
MI,
7551 MI.eraseFromParent();
7555 B.buildInstr(AMDGPU::S_TRAP)
7557 MI.eraseFromParent();
7566 if (!ST.hasTrapHandler() ||
7570 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7573 B.buildInstr(AMDGPU::S_TRAP)
7577 MI.eraseFromParent();
7590 Register NodePtr =
MI.getOperand(2).getReg();
7591 Register RayExtent =
MI.getOperand(3).getReg();
7592 Register RayOrigin =
MI.getOperand(4).getReg();
7594 Register RayInvDir =
MI.getOperand(6).getReg();
7597 if (!ST.hasGFX10_AEncoding()) {
7600 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7607 const bool IsA16 =
MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7608 const bool Is64 =
MRI.getType(NodePtr).getSizeInBits() == 64;
7609 const unsigned NumVDataDwords = 4;
7610 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7611 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7613 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7615 const unsigned BaseOpcodes[2][2] = {
7616 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7617 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7618 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7622 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7623 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7624 : AMDGPU::MIMGEncGfx10NSA,
7625 NumVDataDwords, NumVAddrDwords);
7629 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7630 : AMDGPU::MIMGEncGfx10Default,
7631 NumVDataDwords, NumVAddrDwords);
7636 if (UseNSA && IsGFX11Plus) {
7638 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7639 auto Merged =
B.buildMergeLikeInstr(
7640 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7641 Ops.push_back(Merged.getReg(0));
7644 Ops.push_back(NodePtr);
7645 Ops.push_back(RayExtent);
7646 packLanes(RayOrigin);
7649 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7650 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7651 auto MergedDir =
B.buildMergeLikeInstr(
7654 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7655 UnmergeRayDir.getReg(0)}))
7658 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7659 UnmergeRayDir.getReg(1)}))
7662 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7663 UnmergeRayDir.getReg(2)}))
7665 Ops.push_back(MergedDir.getReg(0));
7668 packLanes(RayInvDir);
7672 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7673 Ops.push_back(Unmerge.getReg(0));
7674 Ops.push_back(Unmerge.getReg(1));
7676 Ops.push_back(NodePtr);
7678 Ops.push_back(RayExtent);
7681 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7682 Ops.push_back(Unmerge.getReg(0));
7683 Ops.push_back(Unmerge.getReg(1));
7684 Ops.push_back(Unmerge.getReg(2));
7687 packLanes(RayOrigin);
7689 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7690 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7694 B.buildMergeLikeInstr(R1,
7695 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7696 B.buildMergeLikeInstr(
7697 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7698 B.buildMergeLikeInstr(
7699 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7705 packLanes(RayInvDir);
7712 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7714 Ops.push_back(MergedOps);
7717 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7726 .addImm(IsA16 ? 1 : 0)
7729 MI.eraseFromParent();
7739 Register DstOrigin =
MI.getOperand(1).getReg();
7741 Register NodePtr =
MI.getOperand(4).getReg();
7742 Register RayExtent =
MI.getOperand(5).getReg();
7743 Register InstanceMask =
MI.getOperand(6).getReg();
7744 Register RayOrigin =
MI.getOperand(7).getReg();
7746 Register Offsets =
MI.getOperand(9).getReg();
7747 Register TDescr =
MI.getOperand(10).getReg();
7749 if (!ST.hasBVHDualAndBVH8Insts()) {
7752 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7757 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7758 const unsigned NumVDataDwords = 10;
7759 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7761 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7762 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7763 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7766 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7767 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7769 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7770 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7776 .addUse(RayExtentInstanceMaskVec.getReg(0))
7783 MI.eraseFromParent();
7792 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7793 MI.eraseFromParent();
7800 if (!ST.hasArchitectedSGPRs())
7804 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7805 auto LSB =
B.buildConstant(
S32, 25);
7806 auto Width =
B.buildConstant(
S32, 5);
7807 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7808 MI.eraseFromParent();
7816 unsigned Width)
const {
7819 if (!
MRI.getRegClassOrNull(DstReg))
7820 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7821 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7824 MI.eraseFromParent();
7838 if (
MRI.getType(Src) !=
S64)
7842 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7846 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7849 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7850 MI.eraseFromParent();
7858 if (
MRI.getType(Src) !=
S64)
7861 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
7865 .addReg(Unmerge.getReg(0));
7869 .addReg(Unmerge.getReg(1));
7870 MI.eraseFromParent();
7882 case Intrinsic::sponentry:
7888 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
7891 B.buildIntToPtr(DstReg, TmpReg);
7892 MI.eraseFromParent();
7894 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
7896 B.buildFrameIndex(
MI.getOperand(0), FI);
7897 MI.eraseFromParent();
7900 case Intrinsic::amdgcn_if:
7901 case Intrinsic::amdgcn_else: {
7904 bool Negated =
false;
7916 std::swap(CondBrTarget, UncondBrTarget);
7918 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7919 if (IntrID == Intrinsic::amdgcn_if) {
7920 B.buildInstr(AMDGPU::SI_IF)
7923 .addMBB(UncondBrTarget);
7925 B.buildInstr(AMDGPU::SI_ELSE)
7928 .addMBB(UncondBrTarget);
7937 B.buildBr(*CondBrTarget);
7940 MRI.setRegClass(Def,
TRI->getWaveMaskRegClass());
7941 MRI.setRegClass(
Use,
TRI->getWaveMaskRegClass());
7942 MI.eraseFromParent();
7943 BrCond->eraseFromParent();
7949 case Intrinsic::amdgcn_loop: {
7952 bool Negated =
false;
7962 std::swap(CondBrTarget, UncondBrTarget);
7964 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7965 B.buildInstr(AMDGPU::SI_LOOP)
7967 .addMBB(UncondBrTarget);
7972 B.buildBr(*CondBrTarget);
7974 MI.eraseFromParent();
7975 BrCond->eraseFromParent();
7976 MRI.setRegClass(Reg,
TRI->getWaveMaskRegClass());
7982 case Intrinsic::amdgcn_addrspacecast_nonnull:
7984 case Intrinsic::amdgcn_make_buffer_rsrc:
7986 case Intrinsic::amdgcn_kernarg_segment_ptr:
7989 B.buildConstant(
MI.getOperand(0).getReg(), 0);
7990 MI.eraseFromParent();
7996 case Intrinsic::amdgcn_implicitarg_ptr:
7998 case Intrinsic::amdgcn_workitem_id_x:
8001 case Intrinsic::amdgcn_workitem_id_y:
8004 case Intrinsic::amdgcn_workitem_id_z:
8007 case Intrinsic::amdgcn_workgroup_id_x:
8012 case Intrinsic::amdgcn_workgroup_id_y:
8017 case Intrinsic::amdgcn_workgroup_id_z:
8022 case Intrinsic::amdgcn_cluster_id_x:
8023 return ST.hasClusters() &&
8026 case Intrinsic::amdgcn_cluster_id_y:
8027 return ST.hasClusters() &&
8030 case Intrinsic::amdgcn_cluster_id_z:
8031 return ST.hasClusters() &&
8034 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8035 return ST.hasClusters() &&
8038 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8039 return ST.hasClusters() &&
8042 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8043 return ST.hasClusters() &&
8046 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8047 return ST.hasClusters() &&
8049 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8050 return ST.hasClusters() &&
8053 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8054 return ST.hasClusters() &&
8057 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8058 return ST.hasClusters() &&
8061 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8062 return ST.hasClusters() &&
8066 case Intrinsic::amdgcn_wave_id:
8068 case Intrinsic::amdgcn_lds_kernel_id:
8071 case Intrinsic::amdgcn_dispatch_ptr:
8074 case Intrinsic::amdgcn_queue_ptr:
8077 case Intrinsic::amdgcn_implicit_buffer_ptr:
8080 case Intrinsic::amdgcn_dispatch_id:
8083 case Intrinsic::r600_read_ngroups_x:
8087 case Intrinsic::r600_read_ngroups_y:
8090 case Intrinsic::r600_read_ngroups_z:
8093 case Intrinsic::r600_read_local_size_x:
8096 case Intrinsic::r600_read_local_size_y:
8100 case Intrinsic::r600_read_local_size_z:
8103 case Intrinsic::amdgcn_fdiv_fast:
8105 case Intrinsic::amdgcn_is_shared:
8107 case Intrinsic::amdgcn_is_private:
8109 case Intrinsic::amdgcn_wavefrontsize: {
8110 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8111 MI.eraseFromParent();
8114 case Intrinsic::amdgcn_s_buffer_load:
8116 case Intrinsic::amdgcn_raw_buffer_store:
8117 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8118 case Intrinsic::amdgcn_struct_buffer_store:
8119 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8121 case Intrinsic::amdgcn_raw_buffer_store_format:
8122 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8123 case Intrinsic::amdgcn_struct_buffer_store_format:
8124 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8126 case Intrinsic::amdgcn_raw_tbuffer_store:
8127 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8128 case Intrinsic::amdgcn_struct_tbuffer_store:
8129 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8131 case Intrinsic::amdgcn_raw_buffer_load:
8132 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8133 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8134 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8135 case Intrinsic::amdgcn_struct_buffer_load:
8136 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8137 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8138 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8140 case Intrinsic::amdgcn_raw_buffer_load_format:
8141 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8142 case Intrinsic::amdgcn_struct_buffer_load_format:
8143 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8145 case Intrinsic::amdgcn_raw_tbuffer_load:
8146 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8147 case Intrinsic::amdgcn_struct_tbuffer_load:
8148 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntersectRayIntrinsic(MI, B);
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:
    return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
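  // The 16x16x128 fp8/bf8 SWMMAC forms carry their index in operand 5 and
  // expect it as s64: vector-typed indices are bitcast, scalars any-extended.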
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT IndexArgTy = MRI.getType(Index);
    if (IndexArgTy != S64) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
                                            : B.buildAnyExt(S64, Index);
      MI.getOperand(5).setReg(NewIndex.getReg(0));
    }
    return true;
  }
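  // The 16x16x32 SWMMAC forms only need the operand-5 index widened to s32.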
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
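  // For the SWMMAC forms below the index lives in operand 7; only the
  // 16x16x128 iu8 variant wants it as s64, the rest use s32.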
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT IdxTy =
        IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 ? S64 : S32;
    LLT IndexArgTy = MRI.getType(Index);
    if (IndexArgTy != IdxTy) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
                                            : B.buildAnyExt(IdxTy, Index);
      MI.getOperand(7).setReg(NewIndex.getReg(0));
    }
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
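  // The cross-lane and DPP intrinsics below share a common path in
  // legalizeLaneOp.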
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  case Intrinsic::amdgcn_dead: {
    // Results of the dead intrinsic are replaced with undef before the
    // instruction is dropped.
    for (const MachineOperand &Def : MI.defs())
      B.buildUndef(Def.getReg());
    MI.eraseFromParent();
    return true;
  }
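  // Cooperative atomic loads/stores lower to plain G_LOAD/G_STORE, reusing
  // the memory operand attached by the IRTranslator.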
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
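  // Flat/global monitor loads are rewritten to target pseudos, keeping the
  // original memory operand.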
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;