#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();

    const LLT Ty = Query.Types[TypeIdx];

    return Ty.getNumElements() % 2 != 0 && EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];

    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;

    const LLT Ty = Query.Types[TypeIdx];

    const int Size = Ty.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return std::make_pair(TypeIdx, LLT::scalar(MemSize));
  const LLT Ty = Query.Types[TypeIdx];

  const unsigned EltSize = Ty.getElementType().getSizeInBits();
  assert(EltSize == 32 || EltSize == 64);

  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {

  return std::pair(TypeIdx,

  const unsigned NumElems = Ty.getElementCount().getFixedValue();

  const unsigned Size = Ty.getSizeInBits();

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&

    return EltSize == 16 || EltSize % 32 == 0;

    const int EltSize = Ty.getElementType().getSizeInBits();
    return EltSize == 32 || EltSize == 64 ||
           (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
           EltSize == 128 || EltSize == 256;

    LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

      (ST.useRealTrue16Insts() && Ty == S16) ||

    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

                                 bool IsLoad, bool IsAtomic) {
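  // Maximum legal memory access width in bits for the queried address
  // space. The returns below encode: private scratch allows 128-bit
  // accesses only with flat scratch enabled, LDS gets 128 with ds128 and
  // 64 otherwise, one case permits wider loads (512) than stores (128),
  // and multi-dword flat scratch addressing (or atomics) widens private
  // accesses from 32 to 128.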
    return ST.hasFlatScratchEnabled() ? 128 : 32;

    return ST.useDS128() ? 128 : 64;

    return IsLoad ? 512 : 128;

    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();

  unsigned AS = Query.Types[1].getAddressSpace();

  if (Ty.isVector() && MemSize != RegSize)

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                    Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;

  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

                               uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

                            Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {

      .widenScalarToNextMultipleOf(0, 32)

  if (ST.hasMad64_32())

  if (ST.hasIntClamp()) {

                   {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

  if (ST.hasVOP3PInsts()) {
      .clampMaxNumElements(0, S8, 2)

                   {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
      TrigActions.customFor({S16});
      FDIVActions.customFor({S16});

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

  auto &MinNumMaxNumIeee =

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
    MinNumMaxNumIeee.legalFor(FPTypesBase)

      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)

  if (ST.hasVOP3PInsts())

      .legalFor(FPTypesPK16)

  if (ST.has16BitInsts()) {

  if (ST.hasFractBug()) {

  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
    FPTruncActions.scalarize(0).lower();
  if (ST.has16BitInsts()) {

  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});

  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)

        .clampMaxNumElements(0, S16, 2)

  if (ST.has16BitInsts())

  if (ST.has16BitInsts())

  if (ST.has16BitInsts())

      .clampScalar(0, S16, S64)

      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
                     {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
                     {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
                     {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

      .scalarSameSizeAs(1, 0)
1281 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1282 .legalForCartesianProduct(
1283 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1284 if (ST.has16BitInsts()) {
1285 CmpBuilder.legalFor({{
S1,
S16}});
1296 {
S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1298 if (ST.hasSALUFloatInsts())
1308 if (ST.has16BitInsts())
1309 ExpOps.customFor({{
S32}, {
S16}});
1311 ExpOps.customFor({
S32});
1312 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1320 .
legalFor(ST.has16BitInsts(), {S16})
1326 .
legalFor(ST.has16BitInsts(), {S16})
1340 .clampScalar(0,
S32,
S32)
  if (ST.has16BitInsts())

      .widenScalarToNextPow2(1)

      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)

      .clampScalar(0, S32, S32)

      .clampScalar(0, S32, S64)

  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2)

  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)
        .clampMaxNumElements(0, S16, 2)

      .widenScalarToNextPow2(0)

      .legalForCartesianProduct(AddrSpaces32, {S32})

      .legalForCartesianProduct(AddrSpaces32, {S32})
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {

    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

      if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
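        // The narrowing mutations below pick the new type for a too-wide
        // access: scalar accesses get clamped toward the maximum legal
        // width, while vector accesses are split into equal power-of-two
        // pieces that still divide the element count evenly, falling back
        // to the element type when no such split exists.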
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

          if (DstSize > MemSize)

          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {

          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {

            if (MaxSize % EltSize == 0) {

            unsigned NumPieces = MemSize / MaxSize;

            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);

            return std::pair(0, EltTy);

          return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)

      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
                 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
                  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
                  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
                  G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics32.legalFor({{S32, FlatPtr}});

  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  auto &AtomicFMinFMax =
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())

  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});

  if (ST.hasAtomicFMinFMaxF32FlatInsts())

  if (ST.hasAtomicFMinFMaxF64FlatInsts())

                 {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

                 LocalPtr, FlatPtr, PrivatePtr,

      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);

      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
          return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];

          const bool isLegalVecType =

          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,

        .clampScalar(EltTypeIdx, S32, S64)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {

          const LLT BigTy = Query.Types[BigTyIdx];

          return notValidElt(Query, LitTyIdx);

          return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;

          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;

          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;

          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

      .clampScalar(0, S32, S64);
  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
  FSHRActionDefs.scalarize(0).lower();

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

      .clampScalar(1, S32, S32)

      G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
      G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
        .legalFor(FPTypesPK16)
  } else if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

                 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  verify(*ST.getInstrInfo());
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT:
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                                   ? AMDGPU::SRC_SHARED_BASE
                                   : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");

    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,

    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
             Intrinsic::amdgcn_addrspacecast_nonnull));

                    : MI.getOperand(1).getReg();

  unsigned SrcAS = SrcTy.getAddressSpace();

    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
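  // Flat -> local/private cast: the cheap case keeps only the low 32 bits
  // of the flat pointer; with globally addressable scratch, the flat
  // scratch base (SRC_FLAT_SCRATCH_BASE_LO) is subtracted out first.
  // Possibly-null pointers are handled afterwards with a compare and a
  // select against the segment's null value.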
  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);

    return B.buildExtract(Dst, Src, 0).getReg(0);

    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();

  auto SegmentNull = B.buildConstant(DstTy, NullVal);
  auto FlatNull = B.buildConstant(SrcTy, 0);

  auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

  MI.eraseFromParent();

  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
            B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
        MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();

  Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

                               SegmentNull.getReg(0));

  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();

      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
  auto PtrLo = B.buildPtrToInt(S32, Src);
  if (AddrHiVal == 0) {
    B.buildIntToPtr(Dst, Zext);
  } else {
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});

  MI.eraseFromParent();
  MI.eraseFromParent();

  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();

  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
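// f64 G_INTRINSIC_TRUNC: recover the unbiased exponent with ubfe on the
// high word, then mask off the fraction bits that lie below the binary
// point. Exponents below 0 produce a signed zero (sign bit preserved);
// exponents above 51 mean the value has no fractional bits and is
// returned unchanged.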
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);

  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
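// 64-bit integer to f64: convert the two 32-bit halves separately (the
// high half signed or unsigned as appropriate), scale the high half by
// 2^32 with ldexp, and add in the always-unsigned low half.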
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
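// FP to 64-bit integer: split the truncated value into high and low 32-bit
// digits via K0 = 2^-32 and K1 = -2^32 (the fma recovers the low part),
// convert each digit with fptoui, then for the signed case apply the sign
// as (x ^ sign) - sign on the merged 64-bit result.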
  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});

  MI.eraseFromParent();
  unsigned StartIdx = Offset / 32;

  auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);

  if (DstCount == 1) {
      B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));

    for (unsigned I = 0; I < DstCount; ++I)
      MergeVec.push_back(Unmerge.getReg(StartIdx + I));
    B.buildMergeLikeInstr(DstReg, MergeVec);

  MI.eraseFromParent();

  Register InsertSrc = MI.getOperand(2).getReg();

  if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)

  unsigned DstCount = DstSize / 32;
  unsigned InsertCount = InsertSize / 32;
  unsigned StartIdx = Offset / 32;

  auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);

  for (unsigned I = 0; I < StartIdx; ++I)

  if (InsertCount == 1) {
      InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);

    auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
    for (unsigned I = 0; I < InsertCount; ++I)

  for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)

  B.buildMergeLikeInstr(DstReg, MergeVec);

  MI.eraseFromParent();
  auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
  auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
  B.buildIntToPtr(Dst, IntElt);

  MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
  auto IntIns = B.buildPtrToInt(IntTy, Ins);
  auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
  B.buildIntToPtr(Dst, IntVecDest);
  MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
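// G_FSIN/G_FCOS: scale the input by 1/(2*pi); on subtargets where the
// hardware trig instructions require a reduced range, also take the
// fractional part via amdgcn_fract before dispatching to amdgcn_sin or
// amdgcn_cos.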
  unsigned Flags = MI.getFlags();

  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                       unsigned GAFlags) const {

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (ST.has64BitLiterals()) {
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)

    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);

      GV->getName() != "llvm.amdgcn.module.lds" &&
        Fn, "local memory global used by non-kernel function",

    B.buildUndef(DstReg);
    MI.eraseFromParent();

    auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
    B.buildIntToPtr(DstReg, Sz);
    MI.eraseFromParent();

    MI.eraseFromParent();

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
  MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  const unsigned ValSize = ValTy.getSizeInBits();

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();

         "this should not have been custom lowered");

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
  case TargetOpcode::G_FPEXT: {
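// Helper for the log lowerings below: if the f32 input is below the
// smallest normal, pre-multiply it by 2^32 so the hardware log instruction
// sees a normal value; the returned predicate lets callers subtract the
// resulting 32.0 bias back out of the result.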
std::pair<Register, Register>
                                        unsigned Flags) const {

  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();

    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
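  // ln(x) and log10(x) are computed as log2(x) * C, with C = ln(2) or
  // log10(2). To preserve precision the constant is split into a head and
  // a tail (c/cc when fast FMA is available, ch/ct via the 0xfffff000 mask
  // otherwise), and the two partial products are recombined with fma/mad.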
  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, NewFlags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
    R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);

    auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
                                         unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

                         ? B.buildFLog2(Ty, Src, Flags)
                         : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();
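  // exp2 for f32 inputs that may land in the denormal range: if the input
  // is below -0x1.f8p+6, add 64 before the hardware exp2 and multiply the
  // result by 2^-64 afterwards to undo the shift.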
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                 RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
                         const SrcOp &Src, unsigned Flags) {
  LLT Ty = Dst.getLLTTy(*B.getMRI());

    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
        .addUse(Src.getReg())

  return B.buildFExp2(Dst, Src, Flags);

                                               bool IsExp10) const {
  LLT Ty = B.getMRI()->getType(X);

  auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
  auto Mul = B.buildFMul(Ty, X, Const, Flags);

  LLT Ty = B.getMRI()->getType(Dst);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
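  // Unsafe 10^x: log2(10) is split into a head K0 and tail K1 so that
  // 10^x = 2^(x*K0) * 2^(x*K1); each factor goes through the hardware
  // exp2, with the same below-threshold input scaling as the exp path
  // above.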
                                              unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);

    auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
    auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
    auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
    auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
    B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);

  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);

  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);

  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
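  // f64 exp/exp2/exp10 expansion: reduce the argument as x ~= n + t with
  // n = rint(x * log2(base)); the per-base constant pairs below are the
  // head/tail splits of ln(2), log2(10) and log2(e). A polynomial P(t) for
  // 2^t is then evaluated as a chain of FMAs and the result reconstructed
  // as ldexp(P, n), with the overflow/underflow edges selected explicitly.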
  if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
    Dn = B.buildFRint(S64, X, Flags).getReg(0);
    F = B.buildFSub(S64, X, Dn, Flags).getReg(0);

    auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
    T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
  } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
    auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

    auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
    auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
    auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
    T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);

    auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
  auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);

  auto One = B.buildFConstant(S64, 1.0);
  P = B.buildFMA(S64, T, P, One, Flags);
  P = B.buildFMA(S64, T, P, One, Flags);

  auto DnInt = B.buildFPTOSI(S32, Dn);
  auto Z = B.buildFLdexp(S64, P, DnInt, Flags);

  Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);

  B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);

  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();
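  // f32 exp/exp10 with precise handling: compute x*log2(base) as a head
  // (PH) plus tail (PL) product, peel off the integer part
  // E = roundeven(PH), run the hardware exp2 on the remainder, and rebuild
  // the result with ldexp. The PH - E subtraction must not be contracted
  // into an FMA, hence FlagsNoContract. Out-of-range inputs are clamped to
  // 0 or +infinity below.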
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
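// G_FPOW: pow(x, y) = exp2(y * log2(x)), using the legacy multiply
// (where 0 * anything = 0) for the product; the f16 variant widens the
// intermediate product to f32 before truncating back.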
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
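// f64 G_FFLOOR: floor(x) = x - fract(x), where fract comes from the
// amdgcn_fract intrinsic clamped below 1.0 via fminnum; NaN inputs select
// the (source-modifier-stripped) original value instead of the clamped
// fract before the final subtraction.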
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
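// Wide integer multiply expansion: the operands arrive as arrays of 32-bit
// digits. Each result digit accumulates 32x32->64 partial products, mostly
// through G_AMDGPU_MAD_U64_U32 chains (buildMadChain below), with carries
// between even- and odd-aligned chains threaded through uadde. On
// subtargets with full-rate 64-bit ops the odd-aligned products are
// accumulated separately and added back in afterwards.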
                                              bool UsePartialMad64_32,
                                              bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);

  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;

    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);

              LocalAccum[0] = Mul.getReg(0);

              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())

          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;

            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;

          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
                .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)

  unsigned NumParts = Size / 32;

  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  TypeSize NumBits = SrcTy.getSizeInBits();

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal == -1;
  Register CondDef = MI.getOperand(0).getReg();

  if (UseMI->getParent() != Parent ||
      UseMI->getOpcode() != AMDGPU::G_BRCOND)

    UncondBrTarget = &*NextMBB;

    if (Next->getOpcode() != AMDGPU::G_BR)

                                  *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();

    auto One = B.buildConstant(S32, 1);
    auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
    auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                  B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

      B.buildCopy(DstReg, GlobalIdXYZ);
      MI.eraseFromParent();

      B.buildCopy(DstReg, ClusterIdXYZ);
      MI.eraseFromParent();

  unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);

  MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      .addImm(ClusterIdField);

  auto Zero = B.buildConstant(S32, 0);

  B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
  MI.eraseFromParent();
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);

  if (ST.hasArchitectedSGPRs() &&

    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(0);

  B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                               Align Alignment) const {
         "unexpected kernarg parameter type");

  MI.eraseFromParent();
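// 32-bit unsigned division: start from the hardware reciprocal of the
// f32-converted denominator, sharpen it with one Newton-Raphson step
// (z += mulhu(z, -y*z)), form the quotient via mulhu, then conditionally
// bump the quotient and pull the remainder back into range with selects.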
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(/*...*/);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(/*...*/);

  auto Mul2 = B.buildFMul(/*...*/);
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(/*...*/);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
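// Full 64-bit unsigned div/rem expansion: refine the reciprocal estimate
// with two rounds of multiply-high based Newton-Raphson (Add1, Add2), form
// the quotient estimate MulHi3, compute the remainder with explicit
// borrow chains, and then apply up to two compare-and-select corrections
// (Add3/Add4 for the quotient, Sub2/Sub3 for the remainder).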
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(/*...*/);

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(/*...*/);

  auto Sel2 = B.buildSelect(/*...*/);
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  MI.eraseFromParent();
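// Signed division reduces to the unsigned expansion: compute the sign of
// each operand as an all-ones/all-zeros mask (arithmetic shift right by the
// sign-bit index), take absolute values with the two's-complement identity
// |x| = (x + mask) ^ mask, divide unsigned, then restore the result sign
// (quotient sign = sign(LHS) ^ sign(RHS); remainder sign = sign(LHS)).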
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
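// Fast (unsafe) floating-point division: when reduced precision is allowed,
// a/b is lowered to a * rcp(b), with special cases for constant numerators
// of 1.0 and -1.0 that fold to a plain (or negated) reciprocal.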
  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
    return false;

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        /*...*/;
    MI.eraseFromParent();
  }

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))
        /*...*/;
    MI.eraseFromParent();
  }

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              /*...*/))
    return false;

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
      /*...*/;
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
      /*...*/;

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
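// f16 division is carried out in f32: extend both operands, form an initial
// quotient with the f32 reciprocal, tighten it with fmad/fma error terms
// (depending on whether the target has v_mad_f32), mask the error
// contribution to its leading bits, truncate back to f16, and finish with
// v_div_fixup for the IEEE edge cases.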
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
                 /*...*/;
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);

  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  } else {
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  }

  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
      /*...*/;

  MI.eraseFromParent();
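// Helper for toggling the f32 denormal mode around the division sequence.
// Targets with S_DENORM_MODE use the dedicated instruction (packing the
// preserved double-precision mode into the upper field); older targets
// write the MODE hardware register via S_SETREG.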
  unsigned SPDenormMode = /*...*/;

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        /*...*/;
  }
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          /*...*/;
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          /*...*/;

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
                       /*...*/;
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals = /*...*/;

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
          /*...*/;
    }
    // ...
  }

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
          /*...*/;
    }
    // ...
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  /*...*/;

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
      /*...*/;

  MI.eraseFromParent();
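// f64 division follows the same div_scale / iterative-FMA / div_fmas /
// div_fixup recipe as f32, with an extra workaround for targets whose
// div_scale condition output is unusable: the scale predicate is
// reconstructed by comparing the exponent words of the scaled values
// against the original operands.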
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       /*...*/;

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
                 /*...*/;

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       /*...*/;

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(CmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(CmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  /*...*/;

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
      /*...*/;

  MI.eraseFromParent();
  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
                  /*...*/;
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
                 /*...*/;

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    // ...
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
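// amdgcn.fdiv.fast: if |RHS| is very large (the 0x1p+96 threshold below),
// the denominator is pre-scaled by 0x1p-32 before taking the reciprocal,
// and the same scale factor is reapplied to the final product. This keeps
// the intermediate rcp in range while preserving the single-rcp fast path.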
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
                 /*...*/;

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
                  /*...*/;
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();
  // ...
  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  /*...*/
      .addUse(SqrtX.getReg(0))
      /*...*/;

  auto NegOne = B.buildConstant(I32, -1);
  auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

  auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
  auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

  auto PosOne = B.buildConstant(I32, 1);
  auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

  auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
  auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

  auto Zero = B.buildFConstant(F32, 0.0f);

  SqrtS =
      B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

  SqrtS =
      B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

  auto SqrtR =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
  B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

  auto Half = B.buildFConstant(F32, 0.5f);
  auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
  auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
  auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
  SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
  SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
  auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
  auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
  SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
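// f64 sqrt: there is no sufficiently accurate f64 sqrt instruction, so the
// input is conditionally scaled up via ldexp (lifting denormals into
// range), refined from v_rsq_f64 with Goldschmidt-style FMA iterations,
// scaled back down, and finally patched for the zero/infinity special
// cases with a select on the original input.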
  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  auto Flags = MI.getFlags();

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 /*...*/;

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
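// Lane-crossing intrinsics (readlane, writelane, permlane, set.inactive,
// DPP moves/updates) fundamentally operate on 32-bit values (64-bit for
// DPP-capable ALUs). Sub-dword operands are any-extended to 32 bits and
// truncated back; wider operands are split into 32-bit (or 64-bit) pieces,
// the lane op is emitted per piece, and the results are merged again.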
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
  switch (IID) {
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
    return LaneOp.getReg(0);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
    return LaneOp.addUse(Src1).getReg(0);
  case Intrinsic::amdgcn_writelane:
    return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    int64_t Src4 = MI.getOperand(6).getImm();
    int64_t Src5 = MI.getOperand(7).getImm();
    return LaneOp.addUse(Src1)
        /*...*/;
  }
  case Intrinsic::amdgcn_mov_dpp8:
    return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
  case Intrinsic::amdgcn_update_dpp:
    return LaneOp.addUse(Src1)
        .addImm(MI.getOperand(4).getImm())
        .addImm(MI.getOperand(5).getImm())
        .addImm(MI.getOperand(6).getImm())
        .addImm(MI.getOperand(7).getImm())
        /*...*/;
  }

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  unsigned Size = Ty.getSizeInBits();

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
      /*...*/)
    SplitSize = 64;

  if (Size == SplitSize) {
    // ...
  }

  Src0 = B.buildAnyExt(S32, Src0).getReg(0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    // ...

  if (IID == Intrinsic::amdgcn_writelane)
    // ...

  Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
  B.buildTrunc(DstReg, LaneOpDst);
  MI.eraseFromParent();

  if (Size % SplitSize != 0)
    return false;

  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    // ...
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      // ...
      NeedsBitcast = true;
    }
  }

  unsigned NumParts = Size / SplitSize;
  auto Src0Parts = B.buildUnmerge(PartialResTy, Src0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  if (NeedsBitcast)
    B.buildBitcast(DstReg, B.buildMergeLikeInstr(/*...*/));
  else
    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
  /*...*/ ST.getTargetLowering()->getImplicitParameterOffset(/*...*/);

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));
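// amdgcn.make.buffer.rsrc: assemble the buffer descriptor from a pointer,
// stride, number of records, and flags. On targets with 45-bit NumRecords
// the fields are shifted and OR-ed into two 64-bit halves; otherwise the
// stride is inserted into the upper 16 bits of the pointer's high word and
// the descriptor is built as four dwords.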
  Register Pointer = MI.getOperand(2).getReg();
  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(S32, Stride);

  if (ST.has45BitNumRecordsBufferResource()) {
    auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
    auto NumRecordsLHS =
        B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
    Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

    auto NumRecordsRHS =
        B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
    auto ShiftedStride =
        B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
    auto ExtShiftedStride =
        B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
    auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
    auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
    Register HighHalf =
        B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
    B.buildMergeValues(Result, {LowHalf, HighHalf});
  } else {
    NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
    auto Unmerge = B.buildUnmerge(S32, Pointer);
    auto LowHalf = Unmerge.getReg(0);
    auto HighHalf = Unmerge.getReg(1);

    auto AndMask = B.buildConstant(S32, 0x0000ffff);
    auto Masked = B.buildAnd(S32, HighHalf, AndMask);
    auto ShiftConst = B.buildConstant(S32, 16);
    auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(0);
    B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =
      /*...*/;
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();

/* ..., */ unsigned AddrSpace) const {
  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());

  if (/*...*/
      ST.hasGloballyAddressableScratch()) {
    Register FlatScratchBaseHi =
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(0);
    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
    /*...*/ B.buildConstant(S32, 1u << 26));
  }

  MI.eraseFromParent();
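// splitBufferOffsets: split a buffer offset into a register base and an
// immediate that fits the instruction encoding. Any overflow beyond the
// maximum representable immediate is folded back into the base register
// (or materialized as a constant when no base exists).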
std::pair<Register, unsigned>
/*...*/ {
  bool CheckNUW = ST.hasGFX1250Insts();
  /*...*/(MRI, OrigOffset, nullptr, CheckNUW);

  if (/*...*/)
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    // ...
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
/* ..., */ bool ImageStore) const {
  // ...
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
    // ...
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // ...
    Reg = B.buildBitcast(S32, Reg).getReg(0);
    // ...
    PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
    // ...

    auto Unmerge = B.buildUnmerge(S16, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      // ...
    PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
    // ...

    auto Unmerge = B.buildUnmerge(S32, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      // ...
    PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
    // ...
  }
/* ..., */ bool IsFormat) const {
  // ...
  VData = B.buildBitcast(Ty, VData).getReg(0);
  // ...
  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      // ...
    }
  }
  // ...

/* ..., */ bool IsFormat) const {
  // ...
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  // ...
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    // ...
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    // ...
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // ...
    Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
    // ...
    Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
    // ...
    Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
  }

  auto MIB = B.buildInstr(Opc)
      /*...*/;

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
/* ..., */ unsigned ImmOffset, unsigned Format,
/*...*/ {
  auto MIB = B.buildInstr(Opc)
      /*...*/;

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
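// Buffer loads: TFE (texel-fail-enable) variants return an extra status
// dword, so the load is widened by one dword and the status register is
// split off with an unmerge afterwards. Sub-dword results are loaded into
// a 32-bit register and truncated; unpacked-D16 targets load each 16-bit
// element zero-extended to a dword and repack after the instruction.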
/* ..., */ bool IsTyped) const {
  // ...
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE)
    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    // ...
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    // ...
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  // ...
  Dst = MI.getOperand(0).getReg();
  B.setInsertPt(B.getMBB(), MI);
  // ...
  Dst = MI.getOperand(0).getReg();
  B.setInsertPt(B.getMBB(), MI);
  // ...

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    // ...
    Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    // ...
    Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // ...
    Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
    // ...
    Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
    // ...
    Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                : AMDGPU::G_AMDGPU_BUFFER_LOAD;
  }

  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    // ...
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    /*...*/(/*...*/, Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (/*...*/) {
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      // ...
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      // ...
      B.buildUnmerge(LoadElts, LoadDstReg);
      // ...
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if (/*...*/ ||
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    /*...*/(/*...*/, Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // ...
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    /*...*/(/*...*/, Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    // ...
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    /*...*/(/*...*/, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    // ...
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    // ...
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  /*...*/
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
/* ..., */ bool IsA16, bool IsG16) {
  // ...
  if (/*...*/
      (B.getMRI()->getType(AddrReg) == S16)) {
    // ...
    B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
        /*...*/;
    assert(/*...*/ &&
           "Bias needs to be converted to 16 bit in A16 mode");
    // ...
    AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
    // ...
    if (((I + 1) >= EndIdx) ||
        /*...*/
        !MI.getOperand(ArgOffset + I + 1).isReg()) {
      B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
          /*...*/;
    } else {
      B.buildBuildVector(
          V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
          /*...*/;
    }
  }
/* ..., */ int DimIdx, int NumVAddrs) {
  // ...
  for (int I = 0; I != NumVAddrs; ++I) {
    // ...
    if (SrcOp.isReg()) {
      // ...
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // ...
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    // ...
    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
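// Image intrinsic legalization. This rewrites the generic image intrinsic
// to a target opcode and massages its operands: the dmask is folded for
// dead/undef results, 16-bit addresses and gradients are packed into
// dwords (A16/G16), the address operands are collapsed into one vector
// when the NSA encoding is unavailable or its operand limit is exceeded,
// and wide or TFE results are split apart and repacked afterwards.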
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

  VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (/*...*/) {
      // ...
    } else if (DMask != 0) {
      // ...
    } else if (!IsTFE && !BaseOpcode->Store) {
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  else if (/*...*/)
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {
    // ...
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    // ...
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      // ...
    auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
    MI.getOperand(2).setReg(Concat.getReg(0));
    MI.getOperand(3).setReg(AMDGPU::NoRegister);
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // ...
  }

  if (IsA16 && !ST.hasA16()) {
    // ...
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // ...
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // ...
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      // ...
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
      // ...
    }

    const unsigned NumPacked = PackedRegs.size();
    // ...
    if (!SrcOp.isReg()) {
      // ...
    }
    // ...
    SrcOp.setReg(AMDGPU::NoRegister);
    // ...
  } else {
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(/*...*/,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               /*...*/);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      // ...
    }
  }

  if (!Ty.isVector() || !IsD16)
    // ...

  if (RepackedReg != VData) {
    MI.getOperand(1).setReg(RepackedReg);
  }

  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
  // ...
  if (NumElts < DMaskLanes)
    // ...

  if (NumElts > 4 || DMaskLanes > 4)
    // ...

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      /*...*/;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    // ...
  }

  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;
  // ...
  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    // ...

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  MI.getOperand(0).setReg(NewResultReg);

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    // ...
    MI.removeOperand(1);
    // ...
    B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
    // ...
  }

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    // ...
    ResultRegs[0] = NewResultReg;
  } else {
    for (int I = 0; I != NumDataRegs; ++I)
      // ...
    B.buildUnmerge(ResultRegs, NewResultReg);
    // ...
    ResultRegs.resize(NumDataRegs);
  }

  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
    // ...
    Reg = B.buildBitcast(V2S16, Reg).getReg(0);
  } else if (ST.hasUnpackedD16VMem()) {
    // ...
    Reg = B.buildTrunc(S16, Reg).getReg(0);
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    // ...
    for (int I = 0; I != NumElts; ++I)
      // ...
  };

  padWithUndef(ResTy, NumElts - ResultRegs.size());
  B.buildBuildVector(DstReg, ResultRegs);
  // ...

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  if (ResultRegs.size() == 1) {
    NewResultReg = ResultRegs[0];
  } else if (ResultRegs.size() == 2) {
    // ...
    NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
  }

  if (/*...*/)
    B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
  else
    B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
  // ...

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();

  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    // ...
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // ...
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    // ...
  }

  B.setInsertPt(B.getMBB(), MI);
  // ...
  B.setInsertPt(B.getMBB(), MI);
  // ...

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      /*...*/);
  // ...
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
  if (!ST.hasTrapHandler() ||
      /*...*/)
    // ...

  return ST.supportsGetDoorbellID() ?
      /*...*/;

  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      /*...*/;
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
      /*...*/;

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

  /*...*/
      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

  B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
                         /*...*/);
  Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
  B.buildCopy(SGPR01, Temp);
  B.buildInstr(AMDGPU::S_TRAP)
      /*...*/;
  MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      /*...*/;
  MI.eraseFromParent();

  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                           /*...*/);
    MI.eraseFromParent();
    return true;
  }

  B.buildInstr(AMDGPU::S_TRAP)
      /*...*/;
  MI.eraseFromParent();

  if (!ST.hasTrapHandler() ||
      /*...*/) {
    /*...*/(
        Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
    // ...
  }

  B.buildInstr(AMDGPU::S_TRAP)
      /*...*/;
  MI.eraseFromParent();
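// image.bvh.intersect.ray: the node pointer, ray extent, origin, direction,
// and inverse direction must be packed into the exact VGPR layout the MIMG
// instruction expects. The layout differs between GFX10/11 NSA, non-NSA,
// and GFX12, and in A16 mode direction/inverse-direction components share
// packed dwords.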
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    /*...*/(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    // ...
  }

  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

  /*...*/(BaseOpcodes[Is64][IsA16],
          IsGFX12Plus ? AMDGPU::MIMGEncGfx12
          : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                      : AMDGPU::MIMGEncGfx10NSA,
          NumVDataDwords, NumVAddrDwords);
  /*...*/(BaseOpcodes[Is64][IsA16],
          IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                  : AMDGPU::MIMGEncGfx10Default,
          NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    auto MergedDir = B.buildMergeLikeInstr(
        /*...*/
        {/*...*/(S32,
                 B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                               UnmergeRayDir.getReg(0)}))
             /*...*/,
         /*...*/(S32,
                 B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                               UnmergeRayDir.getReg(1)}))
             /*...*/,
         /*...*/(S32,
                 B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                               UnmergeRayDir.getReg(2)}))
             /*...*/});
    Ops.push_back(MergedDir.getReg(0));

    // ...
    packLanes(RayInvDir);
  } else {
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));

    Ops.push_back(NodePtr);

    Ops.push_back(RayExtent);

    auto packLanes = [&](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);

    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    // ...
    B.buildMergeLikeInstr(R1,
                          {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMergeLikeInstr(
        R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMergeLikeInstr(
        R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    // ...
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // ...
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
      /*...*/
      .addImm(IsA16 ? 1 : 0)
      /*...*/;

  MI.eraseFromParent();
  Register DstOrigin = MI.getOperand(1).getReg();
  Register NodePtr = MI.getOperand(4).getReg();
  Register RayExtent = MI.getOperand(5).getReg();
  Register InstanceMask = MI.getOperand(6).getReg();
  Register RayOrigin = MI.getOperand(7).getReg();
  Register Offsets = MI.getOperand(9).getReg();
  Register TDescr = MI.getOperand(10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {
    /*...*/(
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
    // ...
  }

  const bool IsBVH8 = /*...*/ == Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
  /*...*/(IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
                 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
          AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});

  B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
      /*...*/
      .addUse(RayExtentInstanceMaskVec.getReg(0))
      /*...*/;

  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  if (!ST.hasArchitectedSGPRs())
    return false;

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

/* ..., */ unsigned Width) const {
  // ...
  MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      /*...*/;
  MI.eraseFromParent();

  /*...*/
  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                   /*...*/);
  /*...*/
  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                   /*...*/);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  /*...*/
      .addReg(Unmerge.getReg(0));
  /*...*/
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  case Intrinsic::sponentry:
    // ...
    B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
    // ...
    B.buildIntToPtr(DstReg, TmpReg);
    MI.eraseFromParent();
    // ...
    int FI = B.getMF().getFrameInfo().CreateFixedObject(
        /*...*/);
    B.buildFrameIndex(MI.getOperand(0), FI);
    MI.eraseFromParent();
    // ...
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    // ...
    bool Negated = false;
    // ...
    if (Negated)
      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    if (IntrID == Intrinsic::amdgcn_if) {
      B.buildInstr(AMDGPU::SI_IF)
          /*...*/
          .addMBB(UncondBrTarget);
    } else {
      B.buildInstr(AMDGPU::SI_ELSE)
          /*...*/
          .addMBB(UncondBrTarget);
    }
    // ...
    B.buildBr(*CondBrTarget);

    MI.eraseFromParent();
    BrCond->eraseFromParent();
    // ...
  }
  case Intrinsic::amdgcn_loop: {
    // ...
    bool Negated = false;
    // ...
    if (Negated)
      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    B.buildInstr(AMDGPU::SI_LOOP)
        /*...*/
        .addMBB(UncondBrTarget);
    // ...
    B.buildBr(*CondBrTarget);

    MI.eraseFromParent();
    BrCond->eraseFromParent();
    // ...
  }
8070 case Intrinsic::amdgcn_addrspacecast_nonnull:
8072 case Intrinsic::amdgcn_make_buffer_rsrc:
8074 case Intrinsic::amdgcn_kernarg_segment_ptr:
8077 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8078 MI.eraseFromParent();
8084 case Intrinsic::amdgcn_implicitarg_ptr:
8086 case Intrinsic::amdgcn_workitem_id_x:
8089 case Intrinsic::amdgcn_workitem_id_y:
8092 case Intrinsic::amdgcn_workitem_id_z:
8095 case Intrinsic::amdgcn_workgroup_id_x:
8100 case Intrinsic::amdgcn_workgroup_id_y:
8105 case Intrinsic::amdgcn_workgroup_id_z:
8110 case Intrinsic::amdgcn_cluster_id_x:
8111 return ST.hasClusters() &&
8114 case Intrinsic::amdgcn_cluster_id_y:
8115 return ST.hasClusters() &&
8118 case Intrinsic::amdgcn_cluster_id_z:
8119 return ST.hasClusters() &&
8122 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8123 return ST.hasClusters() &&
8126 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8127 return ST.hasClusters() &&
8130 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8131 return ST.hasClusters() &&
8134 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8135 return ST.hasClusters() &&
8137 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8138 return ST.hasClusters() &&
8141 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8142 return ST.hasClusters() &&
8145 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8146 return ST.hasClusters() &&
8149 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8150 return ST.hasClusters() &&
8154 case Intrinsic::amdgcn_wave_id:
8156 case Intrinsic::amdgcn_lds_kernel_id:
8159 case Intrinsic::amdgcn_dispatch_ptr:
8162 case Intrinsic::amdgcn_queue_ptr:
8165 case Intrinsic::amdgcn_implicit_buffer_ptr:
8168 case Intrinsic::amdgcn_dispatch_id:
8171 case Intrinsic::r600_read_ngroups_x:
8175 case Intrinsic::r600_read_ngroups_y:
8178 case Intrinsic::r600_read_ngroups_z:
8181 case Intrinsic::r600_read_local_size_x:
8184 case Intrinsic::r600_read_local_size_y:
8188 case Intrinsic::r600_read_local_size_z:
8191 case Intrinsic::amdgcn_fdiv_fast:
8193 case Intrinsic::amdgcn_is_shared:
8195 case Intrinsic::amdgcn_is_private:
8197 case Intrinsic::amdgcn_wavefrontsize: {
8198 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8199 MI.eraseFromParent();
8202 case Intrinsic::amdgcn_s_buffer_load:
8204 case Intrinsic::amdgcn_raw_buffer_store:
8205 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8206 case Intrinsic::amdgcn_struct_buffer_store:
8207 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8209 case Intrinsic::amdgcn_raw_buffer_store_format:
8210 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8211 case Intrinsic::amdgcn_struct_buffer_store_format:
8212 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8214 case Intrinsic::amdgcn_raw_tbuffer_store:
8215 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8216 case Intrinsic::amdgcn_struct_tbuffer_store:
8217 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8219 case Intrinsic::amdgcn_raw_buffer_load:
8220 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8221 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8222 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8223 case Intrinsic::amdgcn_struct_buffer_load:
8224 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8225 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8226 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8228 case Intrinsic::amdgcn_raw_buffer_load_format:
8229 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8230 case Intrinsic::amdgcn_struct_buffer_load_format:
8231 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8233 case Intrinsic::amdgcn_raw_tbuffer_load:
8234 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8235 case Intrinsic::amdgcn_struct_tbuffer_load:
8236 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8238 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8239 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8240 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8241 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8242 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8243 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8244 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8245 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8246 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8247 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8248 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8249 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8250 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8252 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8254 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8255 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8256 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8257 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8258 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8259 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8260 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8261 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8262 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8263 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8264 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8265 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8266 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8267 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8268 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8269 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8270 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8271 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8272 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8273 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8274 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8275 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8276 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8277 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8278 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8279 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8280 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8281 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8282 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8284 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8285 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8286 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8287 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8288 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8289 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8290 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8291 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8292 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8293 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8294 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8295 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8296 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8297 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8298 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8299 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8300 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8301 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8302 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8303 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8304 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8305 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8306 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8308 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8309 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8311 case Intrinsic::amdgcn_rsq_clamp:
8313 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8315 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8316 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8318 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8319 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8320 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8321 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8322 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8323 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8324 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8325 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8329 if (IndexArgTy !=
S64) {
8330 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8331 :
B.buildAnyExt(
S64, Index);
8332 MI.getOperand(5).setReg(NewIndex.getReg(0));
8336 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8337 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8338 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8339 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8340 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8341 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8342 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8343 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8347 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8350 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8351 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8352 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8353 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8354 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8355 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8356 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8357 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8358 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8360 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8364 if (IndexArgTy != IdxTy) {
8365 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8366 :
B.buildAnyExt(IdxTy, Index);
8367 MI.getOperand(7).setReg(NewIndex.getReg(0));
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
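  // All cross-lane operations funnel into one helper that legalizes the
  // value type before the lane operation is selected.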
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  case Intrinsic::amdgcn_dead: {
    // ... (result handling elided in the original listing)
    MI.eraseFromParent();
    return true;
  }
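  // Cooperative atomic loads keep their semantics in the MMO attached by the
  // IRTranslator, so they can lower to an ordinary load.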
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
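  // The store forms mirror the loads: the value in operand 2 is stored to
  // the pointer in operand 1 with the translator-provided MMO.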
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
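  // The load-monitor intrinsics become target pseudos that carry the memory
  // operand through to instruction selection.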
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
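  // Same lowering for the global-address-space variants.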
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;