#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();

  const LLT Ty = Query.Types[TypeIdx];

  return Ty.getNumElements() % 2 != 0 && EltSize > 1 && EltSize < 32 &&
         Ty.getSizeInBits() % 32 != 0;

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();
  unsigned Pieces = (Size + 63) / 64;
  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;

  const LLT Ty = Query.Types[TypeIdx];
  const int Size = Ty.getSizeInBits();
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  return std::make_pair(TypeIdx, LLT::scalar(MemSize));

  const LLT Ty = Query.Types[TypeIdx];
  const unsigned EltSize = Ty.getElementType().getSizeInBits();
  assert(EltSize == 32 || EltSize == 64);

  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  return std::pair(TypeIdx,

  const unsigned NumElems = Ty.getElementCount().getFixedValue();

  const unsigned Size = Ty.getSizeInBits();

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&

  return EltSize == 16 || EltSize % 32 == 0;

  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;

  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

         (ST.useRealTrue16Insts() && Ty == S16) ||

  const LLT Ty = Query.Types[TypeIdx];
  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
         Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

                                 bool IsLoad, bool IsAtomic) {
    return ST.hasFlatScratchEnabled() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
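// The returns above come from a helper that caps the widest single memory
// access per address space (the switch labels are elided in this excerpt):
// roughly, scratch is 32 bits unless flat-scratch allows 128, LDS is 64 or
// 128 bits depending on ds_read/write_b128 support, and global/constant
// loads may go up to 512 bits while stores stay at 128. A sketch of how a
// legality rule might consume such a cap (names here are illustrative only):
//
//   const unsigned MaxSize = maxSizeForAddrSpace(ST, AS, IsLoad, IsAtomic);
//   if (Query.MMODescrs[0].MemoryTy.getSizeInBits() > MaxSize)
//     /* split or narrow the access */;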
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned RegSize = Ty.getSizeInBits();
  unsigned AS = Query.Types[1].getAddressSpace();

  if (Ty.isVector() && MemSize != RegSize)

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                         Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;

  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
                            uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

                         Query.Types[1].getAddressSpace(), Opcode);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)

    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
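// The two helpers above move a wide pointer value between its pointer form
// and a plain integer/vector form: one rebuilds the operand from the 32-bit
// lanes of a vector (extract/merge or bitcast plus inttoptr), the other
// decomposes a pointer into 32-bit pieces (unmerge/build-vector or
// ptrtoint plus bitcast). This is the usual trick for operating on 128-bit
// buffer resource descriptors as <4 x s32>. A hypothetical call site could
// look like:
//
//   Register AsVec = castPointerToVector(B, Pointer, LLT::fixed_vector(4, 32));
//
// where castPointerToVector is an illustrative name, not the function's
// actual identifier (its signature is elided in this excerpt).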
  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)

          .clampMaxNumElementsStrict(0, S16, 2)

    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)

          .clampMaxNumElementsStrict(0, S16, 2)

        .minScalarOrElt(0, S16)

  } else if (ST.has16BitInsts()) {

        .widenScalarToNextMultipleOf(0, 32)

  if (ST.hasMad64_32())

  if (ST.hasIntClamp()) {

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

  if (ST.hasVOP3PInsts()) {
      .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })

      .clampScalar(0, S16, S64);
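// Everything in this constructor uses LegalizerInfo's fluent rule builders:
// getActionDefinitionsBuilder(Opcodes) returns a LegalizeRuleSet, and each
// chained call (legalFor, clampScalar, clampMaxNumElementsStrict, scalarize,
// lower, ...) appends a rule that is consulted in order at legalization
// time. A minimal, self-contained sketch of the pattern (the exact type
// lists are illustrative, not copied from this file):
//
//   getActionDefinitionsBuilder(G_ADD)
//       .legalFor({S32, S16, V2S16})          // keep these types as-is
//       .clampMaxNumElementsStrict(0, S16, 2) // split wider 16-bit vectors
//       .clampScalar(0, S16, S32)             // widen/narrow odd scalars
//       .scalarize(0);                        // fall back to per-element ops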
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

  auto &MinNumMaxNumIeee =

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);

    MinNumMaxNumIeee.legalFor(FPTypesBase)

      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)

    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)

  if (ST.hasVOP3PInsts())

      .legalFor(ST.hasPackedFP32Ops(), {V2S32})

  if (ST.hasPackedFP32Ops())

  if (ST.has16BitInsts()) {

  if (ST.hasFractBug()) {

  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);

  FPTruncActions.scalarize(0).lower();
  if (ST.has16BitInsts()) {

  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});

  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)

      .clampMaxNumElements(0, S16, 2)

  if (ST.has16BitInsts())

  if (ST.has16BitInsts())

  if (ST.has16BitInsts())

      .clampScalar(0, S16, S64)

      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)

        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
      .scalarSameSizeAs(1, 0)

          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())

  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

      .legalFor(ST.has16BitInsts(), {S16})

      .legalFor(ST.has16BitInsts(), {S16})

      .clampScalar(0, S32, S32)
  if (ST.has16BitInsts())

      .widenScalarToNextPow2(1)

      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)

      .clampScalar(0, S32, S32)

      .clampScalar(0, S32, S64)

  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2)

  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)

        .clampMaxNumElements(0, S16, 2)

      .widenScalarToNextPow2(0)

      .legalForCartesianProduct(AddrSpaces32, {S32})

      .legalForCartesianProduct(AddrSpaces32, {S32})
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
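    // Each row above is a {RegisterTy, PtrTy, MemTy, MinAlign} tuple: the
    // value type in registers, the pointer/address-space type of operand 1,
    // the in-memory type, and the minimum alignment (in bits) the row
    // requires; an alignment of 0 means "no constraint", which is what the
    // GlobalAlign* values become when the subtarget allows unaligned buffer
    // access. For example the {S32, GlobalPtr, S8, GlobalAlign8} row makes an
    // extending 8-bit load from global memory into a 32-bit register legal
    // as a single access.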
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);

        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

          if (DstSize > MemSize)

          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);

        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {

            unsigned NumPieces = MemSize / MaxSize;

            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);

          return std::pair(0, EltTy);

          return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)

      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics32.legalFor({{S32, FlatPtr}});

  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

  auto &AtomicFMinFMax =
      .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())

  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())

  if (ST.hasAtomicFMinFMaxF64FlatInsts())

          {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
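  // Taken together, the rules above make integer read-modify-write atomics
  // legal on 32- and 64-bit values in the global, LDS (local), and region
  // address spaces (plus flat when the subtarget has a flat address space),
  // while FP atomics (fadd, fmin/fmax, packed f16/bf16 add) are enabled one
  // feature bit at a time because each generation supports a different
  // subset. The final customFor/legalFor pair appears to belong to the
  // compare-and-swap rules (the opcode itself is elided in this excerpt):
  // custom handling for global/flat, directly legal on LDS and region.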
                   LocalPtr, FlatPtr, PrivatePtr,

      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);

      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
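    // In both branches the shift-amount operand (type index 1) is pinned to
    // s32 and the shifted value (type index 0) is widened to the next power
    // of two before clamping: to the s16..s64 range when the subtarget has
    // true 16-bit shifts, and to s32..s64 otherwise. So, for example, an s8
    // shift becomes an s16 or s32 shift whose amount is always an s32.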
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];

          const bool isLegalVecType =

          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,

      .clampScalar(EltTypeIdx, S32, S64)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {

          const LLT BigTy = Query.Types[BigTyIdx];

          return notValidElt(Query, LitTyIdx);

          return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;

          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;

          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;

          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
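          // Worked example of the mutation above: a G_UNMERGE_VALUES whose
          // big type is s96 gets NewSizeInBits = 1 << Log2_32_Ceil(97) = 128,
          // so the source is widened to s128 before being split. For types
          // already at or past 256 bits the power-of-two jump would be
          // wasteful, so the size is instead rounded up to the next multiple
          // of 64 (e.g. s320 for a 257..320-bit value) when that is smaller.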
      .clampScalar(0, S32, S64);

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});

    SextInReg.lowerFor({{S32}, {S64}});

  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())

  FSHRActionDefs.scalarize(0).lower();

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

      .clampScalar(1, S32, S32)

       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
      .legalFor(FPTypesPK16)
  } else if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

       G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
       G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  verify(*ST.getInstrInfo());
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT:
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLS:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");

    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,

    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
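// The code above is the "get aperture" path used by address-space casts: on
// subtargets with aperture registers the shared/private aperture is the high
// 32 bits of src_shared_base / src_private_base (hence the unmerge taking
// register 1); otherwise it is loaded as an s32, either from the implicit
// kernarg region at getImplicitParameterOffset() or from a queue-pointer
// structure at a fixed StructOffset. The returned s32 becomes the high half
// of the 64-bit flat pointer built later in legalizeAddrSpaceCast.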
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:

  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
             Intrinsic::amdgcn_addrspacecast_nonnull));

          : MI.getOperand(1).getReg();

  unsigned SrcAS = SrcTy.getAddressSpace();

    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);

    return B.buildExtract(Dst, Src, 0).getReg(0);

    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();

  auto SegmentNull = B.buildConstant(DstTy, NullVal);
  auto FlatNull = B.buildConstant(SrcTy, 0);

  auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

  MI.eraseFromParent();
  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})

      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})

          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);

          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);

            B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
        MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();

  Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

                                    SegmentNull.getReg(0));

  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
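  // Summary of the non-null cast path above: a local/private pointer becomes
  // a flat pointer by pairing its 32-bit value (low half) with the segment
  // aperture (high half), or, with globally addressable scratch, by
  // combining the flat-scratch base with a lane index from mbcnt shifted
  // into the high bits. The select against SegmentNull/FlatNull preserves
  // the convention that the segment's null pointer maps to flat null.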
      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    if (AddrHiVal == 0) {
      B.buildIntToPtr(Dst, Zext);

      auto HighAddr = B.buildConstant(S32, AddrHiVal);
      B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});

    MI.eraseFromParent();
  MI.eraseFromParent();

  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
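  // This is the classic round-to-nearest trick for f64 G_INTRINSIC_ROUNDEVEN
  // on targets without a native f64 round instruction: C1Val is presumably
  // 2^52 (the point where every representable double is already an integer,
  // the actual constants are elided in this excerpt), so
  // (Src + copysign(2^52, Src)) - copysign(2^52, Src) rounds Src in the
  // current rounding mode. The final select keeps the original value when
  // |Src| exceeds C2Val, where the value is already integral and the add/sub
  // would lose bits.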
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
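  // G_FREM is expanded with the usual identity
  //   frem(x, y) = x - trunc(x / y) * y
  // expressed as a single fma: fma(-trunc(x/y), y, x). The division inherits
  // the instruction's fast-math flags, so precision matches whatever the
  // original frem was allowed to assume.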
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
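  // 64-bit integer to f64 is built from two 32-bit conversions:
  //   result = ldexp(convert(hi), 32) + uitofp(lo)
  // The high half carries the sign for the signed case, while the low half
  // is always treated as unsigned, and both partial conversions are exact in
  // f64, so the final add is the only rounding step.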
  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                 : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst,
               B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
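  // fp-to-64-bit-int is done in two 32-bit pieces: K0 is 2^-32 and K1 is
  // -2^32 (the exact constants are elided above), so
  //   Hi = convert(floor(x * 2^-32))
  //   Lo = fptoui(fma(floor(x * 2^-32), -2^32, x))
  // reassemble the value as Hi:Lo. For the signed case the input's sign is
  // extracted first, the magnitude is converted, and the result is fixed up
  // with the usual (v ^ sign) - sign two's-complement negation.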
  unsigned StartIdx = Offset / 32;

  auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);

  if (DstCount == 1) {
      B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));

    for (unsigned I = 0; I < DstCount; ++I)
      MergeVec.push_back(Unmerge.getReg(StartIdx + I));
    B.buildMergeLikeInstr(DstReg, MergeVec);

  MI.eraseFromParent();

  Register InsertSrc = MI.getOperand(2).getReg();

  if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)

  unsigned DstCount = DstSize / 32;
  unsigned InsertCount = InsertSize / 32;
  unsigned StartIdx = Offset / 32;

  auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);

  for (unsigned I = 0; I < StartIdx; ++I)

  if (InsertCount == 1) {
      InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);

    auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
    for (unsigned I = 0; I < InsertCount; ++I)

  for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)

  B.buildMergeLikeInstr(DstReg, MergeVec);

  MI.eraseFromParent();
    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)

    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();

  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
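  // G_FSIN/G_FCOS map onto the hardware sin/cos instructions, which take an
  // argument already scaled by 1/(2*pi) (i.e. they compute sin(2*pi*x)). The
  // input is therefore multiplied by OneOver2Pi first, and on subtargets
  // whose sin/cos only accept a [0,1) range the product is additionally
  // wrapped with amdgcn_fract before the amdgcn_sin / amdgcn_cos intrinsic
  // is emitted.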
                                        unsigned GAFlags) const {

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (ST.has64BitLiterals()) {
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);

    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)

    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);

          GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",

      B.buildUndef(DstReg);
      MI.eraseFromParent();

      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

    MI.eraseFromParent();

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();

    MI.eraseFromParent();

  MI.eraseFromParent();

  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

    MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();

         "this should not have been custom lowered");

  Register PackedVal =
      B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:

  case TargetOpcode::G_FSQRT:

  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)

  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                        unsigned Flags) const {

  auto SmallestNormal = B.buildFConstant(

  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();

    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, NewFlags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
    R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);

    auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
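  // Both branches above compute log(x) or log10(x) as log2(x) * K, with K
  // split into a high and a low part (c/cc with fast FMA, ch/ct otherwise)
  // so the product keeps nearly full f32 precision: Y here is the hardware
  // log2 of the (possibly 2^32-scaled) input, and the final FSub removes the
  // 32*log(2) or 32*log10(2) bias that scaling introduced for denormal
  // inputs. When fast-math guarantees finite values, the NaN/Inf passthrough
  // select is skipped.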
                                      unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);

      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale =
      B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
                     const SrcOp &Src, unsigned Flags) {
  LLT Ty = Dst.getLLTTy(*B.getMRI());

    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
        .addUse(Src.getReg())

  return B.buildFExp2(Dst, Src, Flags);

                                       bool IsExp10) const {
  LLT Ty = B.getMRI()->getType(X);

  auto Const =
      B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
  auto Mul = B.buildFMul(Ty, X, Const, Flags);

  LLT Ty = B.getMRI()->getType(Dst);

    auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

    auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
    auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
    auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

    auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

    auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                    .addUse(ExpInput.getReg(0))

    auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
    auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
    B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

                                        unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);

    auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
    auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
    auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
    auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
    B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);

  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);

  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
  if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
    Dn = B.buildFRint(S64, X, Flags).getReg(0);
    F = B.buildFSub(S64, X, Dn, Flags).getReg(0);

    auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
    T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
  } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
    auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

    auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
    auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
    auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
    T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);

    auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

  auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);

  auto One = B.buildFConstant(S64, 1.0);
  P = B.buildFMA(S64, T, P, One, Flags);
  P = B.buildFMA(S64, T, P, One, Flags);

  auto DnInt = B.buildFPTOSI(S32, Dn);
  auto Z = B.buildFLdexp(S64, P, DnInt, Flags);

  Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);

  B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);

  MI.eraseFromParent();
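  // The f64 G_FEXP2/G_FEXP/G_FEXP10 expansion above is a standard range
  // reduction: pick Dn = rint(x * log2(base)) (or rint(x) for exp2), compute
  // the reduced argument T via FMAs against split constants so no precision
  // is lost, evaluate a Horner-form polynomial P(T) approximating exp of the
  // reduced argument, and reassemble the result as ldexp(P, Dn). The two
  // trailing selects clamp the result to +inf above the overflow threshold
  // and to 0 below the underflow threshold.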
  const unsigned Flags = MI.getFlags();

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))

  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
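  // G_FPOW is lowered as pow(x, y) = exp2(log2(x) * y). The multiply uses
  // amdgcn_fmul_legacy, whose 0 * anything == 0 semantics matter for inputs
  // like pow(x, 0) where the intermediate log2 can be -inf or NaN. The f16
  // variant does the multiply in f32 for the extra range and truncates back
  // to f16 before the final exp2.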
    ModSrc = SrcFNeg->getOperand(1).getReg();

    ModSrc = SrcFAbs->getOperand(1).getReg();

    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);

    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
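  // f64 G_FFLOOR on subtargets whose v_fract_f64 has the legacy hardware
  // quirk: floor(x) = x - fract(x), where fract(x) is clamped (the min
  // above) to just below 1.0 so a fract result of exactly 1.0 cannot push
  // the result up by one, and NaN inputs take the unmodified source through
  // the IsNan select instead of the clamped fract.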
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);

  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();

        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);

              LocalAccum[0] = Mul.getReg(0);

              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);

                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())

          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;

              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;

            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;

          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        bool IsHighest = 2 * i >= Accum.size();
                .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);

          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);

          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],

        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)

  unsigned NumParts = Size / 32;

  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);
  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
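  // Wide G_MUL is legalized as grade-school long multiplication over 32-bit
  // limbs: both operands are unmerged into Size/32 pieces, buildMultiply
  // accumulates the partial products column by column (using
  // G_AMDGPU_MAD_U64_U32 to fold a 32x32->64 multiply and a 64-bit add into
  // one mad where profitable), and the accumulator limbs are merged back
  // into the destination. SeparateOddAlignedProducts keeps the odd-aligned
  // column in its own chain on full-rate-64 subtargets so the two chains can
  // overlap.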
  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  TypeSize NumBits = SrcTy.getSizeInBits();

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();

  assert(SrcTy == S32 && "legalizeCTLS only supports s32");
  unsigned BitWidth = SrcTy.getSizeInBits();

  auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);

  B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();

    if (UseMI->getParent() != Parent ||
        UseMI->getOpcode() != AMDGPU::G_BRCOND)

    UncondBrTarget = &*NextMBB;

    if (Next->getOpcode() != AMDGPU::G_BR)

                                       *ArgRC, B.getDebugLoc(), ArgTy);

  const unsigned Mask = Arg->getMask();

    auto ShiftAmt = B.buildConstant(S32, Shift);
    AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);
  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

    B.buildCopy(DstReg, GlobalIdXYZ);
    MI.eraseFromParent();

    B.buildCopy(DstReg, ClusterIdXYZ);
    MI.eraseFromParent();

  unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);

  MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      .addImm(ClusterIdField);
  auto Zero = B.buildConstant(S32, 0);

  B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
  MI.eraseFromParent();
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);

  if (ST.hasArchitectedSGPRs() &&

      Arg = &WorkGroupIDX;
      ArgRC = &AMDGPU::SReg_32RegClass;

      Arg = &WorkGroupIDY;
      ArgRC = &AMDGPU::SReg_32RegClass;

      Arg = &WorkGroupIDZ;
      ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                           Align Alignment) const {
         "unexpected kernarg parameter type");

  MI.eraseFromParent();
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
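  // 32-bit unsigned division without a hardware divider: start from the
  // float reciprocal of Y (G_AMDGPU_RCP_IFLAG), scale it into a ~32-bit
  // fixed-point estimate Z, sharpen it with one Newton-Raphson-style step
  // (Z += umulh(Z, -Y*Z)), then take Q = umulh(X, Z) as the quotient guess
  // and R = X - Q*Y as the remainder guess. Since the guess can undershoot,
  // a correction compares R against Y and conditionally bumps Q by one and
  // reduces R by Y; the routine repeats this fix-up before writing
  // DstDivReg/DstRemReg.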
auto Unmerge = B.buildUnmerge(S32, Val);

auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

auto Mad = B.buildFMAD(

auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
auto Mul1 = B.buildFMul(

auto Mul2 = B.buildFMul(

auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

auto Mad2 = B.buildFMAD(

auto ResultLo = B.buildFPTOUI(S32, Mad2);
auto ResultHi = B.buildFPTOUI(S32, Trunc);

return {ResultLo.getReg(0), ResultHi.getReg(0)};
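// 64-bit unsigned div/rem expansion (below): the initial reciprocal estimate
// {RcpLo, RcpHi} is refined with two Newton-Raphson iterations carried out in
// 32-bit pieces with explicit carry chains (G_UADDO/G_UADDE), the quotient is
// formed as umulh(Numer, Rcp), and the remainder Numer - Denom * Q is then
// compared against Denom to apply up to two final +1 / -Denom corrections.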
auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

auto Zero64 = B.buildConstant(S64, 0);
auto NegDenom = B.buildSub(S64, Zero64, Denom);

auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

auto Zero32 = B.buildConstant(S32, 0);
auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

auto UnmergeNumer = B.buildUnmerge(S32, Numer);
Register NumerLo = UnmergeNumer.getReg(0);
Register NumerHi = UnmergeNumer.getReg(1);

auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
auto Mul3 = B.buildMul(S64, Denom, MulHi3);
auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
Register Mul3_Lo = UnmergeMul3.getReg(0);
Register Mul3_Hi = UnmergeMul3.getReg(1);
auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

auto UnmergeDenom = B.buildUnmerge(S32, Denom);
Register DenomLo = UnmergeDenom.getReg(0);
Register DenomHi = UnmergeDenom.getReg(1);

auto C1 = B.buildSExt(S32, CmpHi);

auto C2 = B.buildSExt(S32, CmpLo);

auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

auto One64 = B.buildConstant(S64, 1);
auto Add3 = B.buildAdd(S64, MulHi3, One64);

auto C6 = B.buildSelect(

auto Add4 = B.buildAdd(S64, Add3, One64);
auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));

auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

auto Sel1 = B.buildSelect(

auto Sel2 = B.buildSelect(
switch (MI.getOpcode()) {

case AMDGPU::G_UDIV: {
  DstDivReg = MI.getOperand(0).getReg();

case AMDGPU::G_UREM: {
  DstRemReg = MI.getOperand(0).getReg();

case AMDGPU::G_UDIVREM: {
  DstDivReg = MI.getOperand(0).getReg();
  DstRemReg = MI.getOperand(1).getReg();

const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();

MI.eraseFromParent();
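// Signed G_SDIV/G_SREM/G_SDIVREM (below) reuse the unsigned expansion: both
// operands are turned into their absolute values with the usual
// ashr/add/xor sign trick, the unsigned div/rem is emitted on the temporaries,
// and the quotient/remainder signs are restored with a final xor/sub against
// the appropriate sign mask.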
if (Ty != S32 && Ty != S64)

const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
switch (MI.getOpcode()) {

case AMDGPU::G_SDIV: {
  DstDivReg = MI.getOperand(0).getReg();

case AMDGPU::G_SREM: {
  DstRemReg = MI.getOperand(0).getReg();

case AMDGPU::G_SDIVREM: {
  DstDivReg = MI.getOperand(0).getReg();
  DstRemReg = MI.getOperand(1).getReg();

auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
B.buildSub(DstDivReg, SignXor, Sign);

auto Sign = LHSign.getReg(0);
auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
B.buildSub(DstRemReg, SignXor, Sign);

MI.eraseFromParent();
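// Fast/unsafe FDIV (below): when an inaccurate reciprocal is allowed (or the
// type is f16), a divide by a constant +/-1.0 collapses to a single
// amdgcn.rcp (optionally on the negated operand), the general case becomes
// LHS * rcp(RHS), and the refined variant runs two FMA correction steps on
// the reciprocal before the final multiply/FMA.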
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

if (CLHS->isExactlyValue(1.0)) {
  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

  MI.eraseFromParent();

if (CLHS->isExactlyValue(-1.0)) {
  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
      .addUse(FNeg.getReg(0))

  MI.eraseFromParent();

if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

B.buildFMul(Res, LHS, RCP, Flags);

MI.eraseFromParent();

if (!AllowInaccurateRcp)

X = B.buildFConstant(ResTy, 1.0).getReg(0);

Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
auto One = B.buildFConstant(ResTy, 1.0);

auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

R = B.buildFNeg(ResTy, R);

auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp0, R, R);

auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp1, R, R);

B.buildCopy(Res, R);
MI.eraseFromParent();

auto Ret = B.buildFMul(ResTy, X, R);
auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

B.buildFMA(Res, Tmp2, R, Ret);
MI.eraseFromParent();
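// f16 FDIV (below) is carried out in f32: extend both operands, take
// amdgcn.rcp of the extended RHS, multiply to get an initial quotient, run
// the error/correction steps (FMAD on subtargets with mad/mac f32, FMA
// otherwise), keep only the sign/exponent bits of the final correction term
// (mask with 0xff800000) before the last add, truncate back to f16, and
// finish with amdgcn.div.fixup.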
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
auto NegRHSExt = B.buildFNeg(S32, RHSExt);
auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
               .addUse(RHSExt.getReg(0))

auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);

if (ST.hasMadMacF32Insts()) {
  Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
  Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);

  Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
  Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);

auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
auto RDst = B.buildFPTrunc(S16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
    .addUse(RDst.getReg(0))

MI.eraseFromParent();
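// Helper below: flip the FP32 denormal mode around the core FDIV sequence,
// either with the dedicated S_DENORM_MODE instruction (keeping the FP64/FP16
// field of the mode intact) or, on older subtargets, with
// S_SETREG_IMM32_B32 on the MODE register.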
unsigned SPDenormMode =

if (ST.hasDenormModeInst()) {

  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
  B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
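// f32 FDIV (below): the div_scale / div_fmas / div_fixup sequence. Numerator
// and denominator are pre-scaled with amdgcn.div.scale, the scaled
// denominator's reciprocal is refined through a chain of FMAs (Fma0..Fma4),
// denormals are temporarily enabled around that chain when the current mode
// does not preserve them (saving and restoring a dynamic mode via
// S_GETREG_B32 / S_SETREG_B32), and amdgcn.div.fmas plus amdgcn.div.fixup
// produce the final result.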
auto One = B.buildFConstant(S32, 1.0f);

auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                     .addUse(DenominatorScaled.getReg(0))

auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

const bool HasDynamicDenormals =

if (!PreservesDenormals) {
  if (HasDynamicDenormals) {

    B.buildInstr(AMDGPU::S_GETREG_B32)
        .addDef(SavedSPDenormMode)

auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

if (!PreservesDenormals) {
  if (HasDynamicDenormals) {
    assert(SavedSPDenormMode);
    B.buildInstr(AMDGPU::S_SETREG_B32)
        .addReg(SavedSPDenormMode)

auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                .addUse(Fma4.getReg(0))
                .addUse(Fma1.getReg(0))
                .addUse(Fma3.getReg(0))
                .addUse(NumeratorScaled.getReg(1))

B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
    .addUse(Fmas.getReg(0))

MI.eraseFromParent();
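// f64 FDIV (below) follows the same div_scale / div_fmas / div_fixup shape as
// the f32 path, with an extra FMA refinement round and a workaround for
// subtargets whose div.scale condition output is unusable: the scale flag is
// reconstructed by comparing the high dwords of the scaled operands against
// the original numerator/denominator and xor'ing the two comparisons.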
auto One = B.buildFConstant(S64, 1.0);

auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
               .addUse(DivScale0.getReg(0))

auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

if (!ST.hasUsableDivScaleConditionOutput()) {

  auto NumUnmerge = B.buildUnmerge(S32, LHS);
  auto DenUnmerge = B.buildUnmerge(S32, RHS);
  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

      Scale1Unmerge.getReg(1));
      Scale0Unmerge.getReg(1));
  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

  Scale = DivScale1.getReg(1);

auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                .addUse(Fma4.getReg(0))
                .addUse(Fma3.getReg(0))
                .addUse(Mul.getReg(0))

B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
    .addUse(Fmas.getReg(0))

MI.eraseFromParent();
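// G_FFREXP (below) maps onto amdgcn.frexp.mant / amdgcn.frexp.exp; on
// subtargets with the fract bug the input is additionally checked for being
// finite (via fabs), and for inf/nan inputs the mantissa is replaced by the
// original value and the exponent by zero.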
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

if (ST.hasFractBug()) {
  auto Fabs = B.buildFAbs(Ty, Val);

  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

B.buildCopy(Res0, Mant);
B.buildSExtOrTrunc(Res1, Exp);

MI.eraseFromParent();
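// amdgcn.fdiv.fast (below): when |RHS| exceeds 0x1p+96 the denominator is
// pre-scaled by 0x1p-32 (otherwise by 1.0), a single amdgcn.rcp plus multiply
// forms the quotient, and the same select factor rescales the result.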
auto Abs = B.buildFAbs(S32, RHS, Flags);

auto C0 = B.buildFConstant(S32, 0x1p+96f);
auto C1 = B.buildFConstant(S32, 0x1p-32f);
auto C2 = B.buildFConstant(S32, 1.0f);

auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
               .addUse(Mul0.getReg(0))

auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

B.buildFMul(Res, Sel, Mul1, Flags);

MI.eraseFromParent();
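// f16 FSQRT (below, only reached when the subtarget lacks 16-bit
// instructions): extend to f32, call amdgcn.sqrt, and truncate the result
// back to f16.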
unsigned Flags = MI.getFlags();
assert(!ST.has16BitInsts());

auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                .addUse(Ext.getReg(0))

B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
MI.eraseFromParent();
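// f32 FSQRT (below): inputs below 0x1p-96 are scaled up by 0x1p+32 before the
// hardware sqrt. The correctly rounded path then nudges the integer result by
// +/-1 ulp based on the sign of the residuals x - s*s, while the fast path
// refines an rsq-based estimate with one more FMA iteration. The result is
// scaled back down by 0x1p-16 when the input was scaled, and zero or infinite
// inputs pass through unchanged.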
const unsigned Flags = MI.getFlags();

MI.eraseFromParent();

auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

    .addUse(SqrtX.getReg(0))

auto NegOne = B.buildConstant(I32, -1);
auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

auto PosOne = B.buildConstant(I32, 1);
auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

auto Zero = B.buildFConstant(F32, 0.0f);

B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

auto Half = B.buildFConstant(F32, 0.5f);
auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

MI.eraseFromParent();
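// f64 FSQRT (below): inputs below 0x1.0p-767 are scaled up with ldexp(x, 256)
// before the rsq estimate, the estimate is refined through a short sequence
// of FMA iterations, the result is rescaled with ldexp(r, -128) when scaling
// was applied, and zero or infinite inputs again pass through unchanged.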
unsigned Flags = MI.getFlags();

auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

ZeroInt = B.buildConstant(S32, 0).getReg(0);

auto ScaleUpFactor = B.buildConstant(S32, 256);
auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);

auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);

auto Half = B.buildFConstant(F64, 0.5);
auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

Register SqrtRet = SqrtS2.getReg(0);

auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

auto ScaleDownFactor = B.buildConstant(S32, -128);
auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);

auto ZeroFP = B.buildFConstant(F64, 0.0);

B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

MI.eraseFromParent();
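// amdgcn.rsq.clamp (below) becomes a plain rsq whose result is clamped to
// +/- the largest finite float, using the IEEE or non-IEEE min/max variants
// depending on the function's FP mode.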
auto Flags = MI.getFlags();

auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                        : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
MI.eraseFromParent();
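// The lane intrinsics handled below (readlane/readfirstlane/writelane,
// permlane16/x16/64, mov.dpp8, update.dpp, set.inactive) only operate on
// 32-bit pieces (64-bit for DPALU-capable update.dpp): sub-32-bit sources are
// any-extended and the result truncated back, while wider scalar and vector
// values are unmerged into 32-bit (or packed 16-bit vector) parts, the lane
// op is rebuilt per part, and the parts are merged or bitcast back into the
// destination type.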
6122 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6123 IID == Intrinsic::amdgcn_permlanex16;
6124 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6125 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6129 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6131 case Intrinsic::amdgcn_readfirstlane:
6132 case Intrinsic::amdgcn_permlane64:
6133 return LaneOp.getReg(0);
6134 case Intrinsic::amdgcn_readlane:
6135 case Intrinsic::amdgcn_set_inactive:
6136 case Intrinsic::amdgcn_set_inactive_chain_arg:
6137 return LaneOp.addUse(Src1).getReg(0);
6138 case Intrinsic::amdgcn_writelane:
6139 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6140 case Intrinsic::amdgcn_permlane16:
6141 case Intrinsic::amdgcn_permlanex16: {
6143 int64_t Src4 =
MI.getOperand(6).getImm();
6144 int64_t Src5 =
MI.getOperand(7).getImm();
6145 return LaneOp.addUse(Src1)
6152 case Intrinsic::amdgcn_mov_dpp8:
6153 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6154 case Intrinsic::amdgcn_update_dpp:
6155 return LaneOp.addUse(Src1)
6156 .addImm(
MI.getOperand(4).getImm())
6157 .addImm(
MI.getOperand(5).getImm())
6158 .addImm(
MI.getOperand(6).getImm())
6159 .addImm(
MI.getOperand(7).getImm())
6169 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6170 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6171 Src1 =
MI.getOperand(3).getReg();
6172 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6173 Src2 =
MI.getOperand(4).getReg();
6178 unsigned Size = Ty.getSizeInBits();
6180 unsigned SplitSize = 32;
6181 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6182 ST.hasDPALU_DPP() &&
6186 if (
Size == SplitSize) {
6192 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6194 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6197 if (IID == Intrinsic::amdgcn_writelane)
6200 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6201 B.buildTrunc(DstReg, LaneOpDst);
6202 MI.eraseFromParent();
6206 if (
Size % SplitSize != 0)
6210 bool NeedsBitcast =
false;
6211 if (Ty.isVector()) {
6214 if (EltSize == SplitSize) {
6215 PartialResTy = EltTy;
6216 }
else if (EltSize == 16 || EltSize == 32) {
6217 unsigned NElem = SplitSize / EltSize;
6221 NeedsBitcast =
true;
6226 unsigned NumParts =
Size / SplitSize;
6230 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6231 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6233 if (IID == Intrinsic::amdgcn_writelane)
6234 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6236 for (
unsigned i = 0; i < NumParts; ++i) {
6237 Src0 = Src0Parts.
getReg(i);
6239 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6240 Src1 = Src1Parts.
getReg(i);
6242 if (IID == Intrinsic::amdgcn_writelane)
6243 Src2 = Src2Parts.
getReg(i);
6245 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6249 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6252 B.buildMergeLikeInstr(DstReg, PartialRes);
6254 MI.eraseFromParent();
6262 ST.getTargetLowering()->getImplicitParameterOffset(
6272 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6273 B.buildConstant(IdxTy,
Offset).getReg(0));
6284 Register Pointer =
MI.getOperand(2).getReg();
6286 Register NumRecords =
MI.getOperand(4).getReg();
6292 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6294 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6296 if (ST.has45BitNumRecordsBufferResource()) {
6301 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6302 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6303 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6304 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6308 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6309 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6310 auto ExtShiftedStride =
6311 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6312 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6313 auto ExtShiftedFlags =
6314 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6315 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6317 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6318 B.buildMergeValues(Result, {LowHalf, HighHalf});
6320 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6321 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6322 auto LowHalf = Unmerge.getReg(0);
6323 auto HighHalf = Unmerge.getReg(1);
6325 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6326 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6327 auto ShiftConst =
B.buildConstant(
S32, 16);
6328 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6329 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6330 Register NewHighHalfReg = NewHighHalf.getReg(0);
6331 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6334 MI.eraseFromParent();
6351 MI.eraseFromParent();
6359 std::optional<uint32_t> KnownSize =
6361 if (KnownSize.has_value())
6362 B.buildConstant(DstReg, *KnownSize);
6380 MI.eraseFromParent();
6387 unsigned AddrSpace)
const {
6389 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6393 ST.hasGloballyAddressableScratch()) {
6395 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6396 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6398 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6400 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6402 B.buildConstant(
S32, 1u << 26));
6407 MI.eraseFromParent();
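// splitBufferOffsets (below) splits a buffer offset into a register base and
// an immediate that fits the MUBUF/MTBUF immediate field; any overflow beyond
// the maximum immediate is folded back into the base register (creating or
// adding a constant), and a zero base is materialized when nothing remains.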
std::pair<Register, unsigned>

bool CheckNUW = ST.hasGFX1250Insts();
    MRI, OrigOffset, nullptr, CheckNUW);

BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
if ((int32_t)Overflow < 0) {
  Overflow += ImmOffset;

if (Overflow != 0) {

  BaseReg = B.buildConstant(S32, Overflow).getReg(0);

  auto OverflowVal = B.buildConstant(S32, Overflow);
  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

BaseReg = B.buildConstant(S32, 0).getReg(0);

return std::pair(BaseReg, ImmOffset);
6471 bool ImageStore)
const {
6477 if (ST.hasUnpackedD16VMem()) {
6478 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6481 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6482 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6490 if (ImageStore && ST.hasImageStoreD16Bug()) {
6493 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6495 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6502 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6503 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6505 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6513 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6514 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6516 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6533 bool IsFormat)
const {
6545 VData =
B.buildBitcast(Ty, VData).getReg(0);
6553 if (Ty.isVector()) {
6554 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6566 bool IsFormat)
const {
6573 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6588 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6591 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6595 VIndex =
MI.getOperand(3).getReg();
6598 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6601 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6602 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6606 Format =
MI.getOperand(5 + OpOffset).getImm();
6610 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6616 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6617 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6618 }
else if (IsFormat) {
6619 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6620 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6624 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6627 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6630 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6635 auto MIB =
B.buildInstr(
Opc)
6646 MIB.addImm(AuxiliaryData)
6647 .addImm(HasVIndex ? -1 : 0)
6648 .addMemOperand(MMO);
6650 MI.eraseFromParent();
6656 unsigned ImmOffset,
unsigned Format,
6659 auto MIB =
B.buildInstr(
Opc)
6670 MIB.addImm(AuxiliaryData)
6671 .addImm(HasVIndex ? -1 : 0)
6672 .addMemOperand(MMO);
6678 bool IsTyped)
const {
6692 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6693 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6695 StatusDst =
MI.getOperand(1).getReg();
6700 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6703 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6706 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6709 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6712 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6715 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6716 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6720 Format =
MI.getOperand(5 + OpOffset).getImm();
6724 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6734 Dst =
MI.getOperand(0).getReg();
6735 B.setInsertPt(
B.getMBB(),
MI);
6742 Dst =
MI.getOperand(0).getReg();
6743 B.setInsertPt(
B.getMBB(),
MI);
6747 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6748 const bool Unpacked = ST.hasUnpackedD16VMem();
6758 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6759 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6760 }
else if (IsFormat) {
6764 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6766 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6767 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6772 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6773 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6776 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6777 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6780 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6781 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6787 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6788 unsigned NumLoadDWords = NumValueDWords + 1;
6790 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6792 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6794 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6795 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6796 B.buildTrunc(Dst, ExtDst);
6797 }
else if (NumValueDWords == 1) {
6798 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6801 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6802 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6804 B.buildUnmerge(LoadElts, LoadDstReg);
6806 B.buildMergeLikeInstr(Dst, LoadElts);
6809 (IsD16 && !Ty.isVector())) {
6810 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6812 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6813 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6814 B.buildTrunc(Dst, LoadDstReg);
6815 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6817 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6819 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6820 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6822 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6824 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6825 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6826 B.buildMergeLikeInstr(Dst, Repack);
6829 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6832 MI.eraseFromParent();
6838 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6839 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6840 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6841 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6842 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6843 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6844 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6845 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6846 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6847 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6848 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6849 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6850 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6851 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6852 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6853 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6854 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6855 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6856 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6857 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6858 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6859 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6860 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6861 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6862 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6863 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6864 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6865 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6866 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6867 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6868 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6870 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6871 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6872 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6873 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6874 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6875 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6876 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6877 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6878 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6879 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6880 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6883 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6885 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6888 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6889 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6890 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6893 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6895 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6898 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6900 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6903 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6904 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6905 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6908 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6910 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6911 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6913 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6915 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6916 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6918 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6919 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6920 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6921 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6923 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6925 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6936 const bool IsCmpSwap =
6937 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6938 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6939 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6940 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6951 CmpVal =
MI.getOperand(3).getReg();
6956 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6957 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6960 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6963 VIndex =
MI.getOperand(4 + OpOffset).getReg();
6966 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
6969 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
6970 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
6971 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
6990 .addImm(AuxiliaryData)
6991 .addImm(HasVIndex ? -1 : 0)
6992 .addMemOperand(MMO);
6994 MI.eraseFromParent();
7004 bool IsA16,
bool IsG16) {
7020 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7025 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7029 "Bias needs to be converted to 16 bit in A16 mode");
7031 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7037 if (((
I + 1) >= EndIdx) ||
7044 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7046 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7051 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7062 int DimIdx,
int NumVAddrs) {
7066 for (
int I = 0;
I != NumVAddrs; ++
I) {
7068 if (
SrcOp.isReg()) {
7074 int NumAddrRegs = AddrRegs.
size();
7075 if (NumAddrRegs != 1) {
7078 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7081 for (
int I = 1;
I != NumVAddrs; ++
I) {
7084 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7106 const unsigned NumDefs =
MI.getNumExplicitDefs();
7107 const unsigned ArgOffset = NumDefs + 1;
7108 bool IsTFE = NumDefs == 2;
7126 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7130 const bool IsAtomicPacked16Bit =
7131 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7132 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7140 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7141 const bool IsA16 = AddrTy ==
S16;
7142 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7145 if (!BaseOpcode->
Atomic) {
7146 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7149 }
else if (DMask != 0) {
7151 }
else if (!IsTFE && !BaseOpcode->
Store) {
7153 B.buildUndef(
MI.getOperand(0));
7154 MI.eraseFromParent();
7162 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7163 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7164 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7165 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7166 unsigned NewOpcode = LoadOpcode;
7167 if (BaseOpcode->
Store)
7168 NewOpcode = StoreOpcode;
7170 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7173 MI.setDesc(
B.getTII().get(NewOpcode));
7177 if (IsTFE && DMask == 0) {
7180 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7183 if (BaseOpcode->
Atomic) {
7188 if (Ty.isVector() && !IsAtomicPacked16Bit)
7195 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7196 MI.getOperand(2).setReg(
Concat.getReg(0));
7197 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7201 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7204 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7210 if (IsA16 && !ST.hasA16()) {
7215 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7216 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7218 if (IsA16 || IsG16) {
7226 const bool UseNSA = ST.hasNSAEncoding() &&
7227 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7228 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7229 const bool UsePartialNSA =
7230 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7232 if (UsePartialNSA) {
7236 auto Concat =
B.buildConcatVectors(
7237 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7238 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7239 PackedRegs.
resize(NSAMaxSize);
7240 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7242 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7243 PackedRegs[0] =
Concat.getReg(0);
7247 const unsigned NumPacked = PackedRegs.
size();
7250 if (!
SrcOp.isReg()) {
7260 SrcOp.setReg(AMDGPU::NoRegister);
7277 const bool UseNSA = ST.hasNSAEncoding() &&
7278 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7279 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7280 const bool UsePartialNSA =
7281 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7283 if (UsePartialNSA) {
7285 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7287 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7302 if (!Ty.isVector() || !IsD16)
7306 if (RepackedReg != VData) {
7307 MI.getOperand(1).setReg(RepackedReg);
7315 const int NumElts = Ty.
isVector() ? Ty.getNumElements() : 1;
7318 if (NumElts < DMaskLanes)
7321 if (NumElts > 4 || DMaskLanes > 4)
7331 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7332 const LLT AdjustedTy =
7348 if (IsD16 && ST.hasUnpackedD16VMem()) {
7355 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7356 unsigned RoundedSize = 32 * RoundedElts;
7360 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7365 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7371 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7375 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7376 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7380 MI.getOperand(0).setReg(NewResultReg);
7388 Dst1Reg =
MI.getOperand(1).getReg();
7393 MI.removeOperand(1);
7397 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7406 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7408 if (ResultNumRegs == 1) {
7410 ResultRegs[0] = NewResultReg;
7413 for (
int I = 0;
I != NumDataRegs; ++
I)
7415 B.buildUnmerge(ResultRegs, NewResultReg);
7420 ResultRegs.
resize(NumDataRegs);
7425 if (IsD16 && !Ty.isVector()) {
7426 B.buildTrunc(DstReg, ResultRegs[0]);
7431 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7432 B.buildBitcast(DstReg, ResultRegs[0]);
7444 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7446 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7447 }
else if (ST.hasUnpackedD16VMem()) {
7449 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7453 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7457 for (
int I = 0;
I != NumElts; ++
I)
7464 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7465 B.buildBuildVector(DstReg, ResultRegs);
7469 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7470 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7476 if (ResultRegs.
size() == 1) {
7477 NewResultReg = ResultRegs[0];
7478 }
else if (ResultRegs.
size() == 2) {
7480 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7488 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7490 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7495 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7496 B.buildConcatVectors(DstReg, ResultRegs);
7505 Register OrigDst =
MI.getOperand(0).getReg();
7507 LLT Ty =
B.getMRI()->getType(OrigDst);
7508 unsigned Size = Ty.getSizeInBits();
7511 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7513 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7514 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7517 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7519 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7528 B.setInsertPt(
B.getMBB(),
MI);
7533 B.setInsertPt(
B.getMBB(),
MI);
7539 MI.setDesc(
B.getTII().get(
Opc));
7540 MI.removeOperand(1);
7543 const unsigned MemSize = (
Size + 7) / 8;
7544 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7551 MI.addMemOperand(MF, MMO);
7552 if (Dst != OrigDst) {
7553 MI.getOperand(0).setReg(Dst);
7554 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7555 B.buildTrunc(OrigDst, Dst);
7577 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7578 MI.removeOperand(0);
7588 if (!ST.hasTrapHandler() ||
7592 return ST.supportsGetDoorbellID() ?
7605 MI.eraseFromParent();
7615 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7617 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7621 MI.eraseFromParent();
7630 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7637 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7657 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7660 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7661 B.buildCopy(SGPR01, Temp);
7662 B.buildInstr(AMDGPU::S_TRAP)
7665 MI.eraseFromParent();
7676 B.buildCopy(SGPR01, LiveIn);
7677 B.buildInstr(AMDGPU::S_TRAP)
7681 MI.eraseFromParent();
7690 if (ST.hasPrivEnabledTrap2NopBug()) {
7691 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7693 MI.eraseFromParent();
7697 B.buildInstr(AMDGPU::S_TRAP)
7699 MI.eraseFromParent();
7708 if (!ST.hasTrapHandler() ||
7712 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7715 B.buildInstr(AMDGPU::S_TRAP)
7719 MI.eraseFromParent();
7732 Register NodePtr =
MI.getOperand(2).getReg();
7733 Register RayExtent =
MI.getOperand(3).getReg();
7734 Register RayOrigin =
MI.getOperand(4).getReg();
7736 Register RayInvDir =
MI.getOperand(6).getReg();
7739 if (!ST.hasGFX10_AEncoding()) {
7742 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7751 const unsigned NumVDataDwords = 4;
7752 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7753 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7755 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7757 const unsigned BaseOpcodes[2][2] = {
7758 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7759 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7760 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7764 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7765 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7766 : AMDGPU::MIMGEncGfx10NSA,
7767 NumVDataDwords, NumVAddrDwords);
7771 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7772 : AMDGPU::MIMGEncGfx10Default,
7773 NumVDataDwords, NumVAddrDwords);
7778 if (UseNSA && IsGFX11Plus) {
7780 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7781 auto Merged =
B.buildMergeLikeInstr(
7782 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7783 Ops.push_back(Merged.getReg(0));
7786 Ops.push_back(NodePtr);
7787 Ops.push_back(RayExtent);
7788 packLanes(RayOrigin);
7791 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7792 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7793 auto MergedDir =
B.buildMergeLikeInstr(
7796 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7797 UnmergeRayDir.getReg(0)}))
7800 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7801 UnmergeRayDir.getReg(1)}))
7804 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7805 UnmergeRayDir.getReg(2)}))
7807 Ops.push_back(MergedDir.getReg(0));
7810 packLanes(RayInvDir);
7814 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7815 Ops.push_back(Unmerge.getReg(0));
7816 Ops.push_back(Unmerge.getReg(1));
7818 Ops.push_back(NodePtr);
7820 Ops.push_back(RayExtent);
7823 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7824 Ops.push_back(Unmerge.getReg(0));
7825 Ops.push_back(Unmerge.getReg(1));
7826 Ops.push_back(Unmerge.getReg(2));
7829 packLanes(RayOrigin);
7831 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7832 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7836 B.buildMergeLikeInstr(R1,
7837 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7838 B.buildMergeLikeInstr(
7839 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7840 B.buildMergeLikeInstr(
7841 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7847 packLanes(RayInvDir);
7854 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7856 Ops.push_back(MergedOps);
7859 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7868 .addImm(IsA16 ? 1 : 0)
7871 MI.eraseFromParent();
7881 Register DstOrigin =
MI.getOperand(1).getReg();
7883 Register NodePtr =
MI.getOperand(4).getReg();
7884 Register RayExtent =
MI.getOperand(5).getReg();
7885 Register InstanceMask =
MI.getOperand(6).getReg();
7886 Register RayOrigin =
MI.getOperand(7).getReg();
7888 Register Offsets =
MI.getOperand(9).getReg();
7889 Register TDescr =
MI.getOperand(10).getReg();
7891 if (!ST.hasBVHDualAndBVH8Insts()) {
7894 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7899 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7900 const unsigned NumVDataDwords = 10;
7901 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7903 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7904 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7905 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7908 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7909 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7911 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7912 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7918 .addUse(RayExtentInstanceMaskVec.getReg(0))
7925 MI.eraseFromParent();
7934 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7935 MI.eraseFromParent();
7942 if (!ST.hasArchitectedSGPRs())
7946 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7947 auto LSB =
B.buildConstant(
S32, 25);
7948 auto Width =
B.buildConstant(
S32, 5);
7949 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7950 MI.eraseFromParent();
7958 unsigned Width)
const {
7962 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7963 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7966 MI.eraseFromParent();
7984 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7988 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7991 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7992 MI.eraseFromParent();
8003 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8007 .addReg(Unmerge.getReg(0));
8011 .addReg(Unmerge.getReg(1));
8012 MI.eraseFromParent();
8024 case Intrinsic::sponentry:
8030 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8033 B.buildIntToPtr(DstReg, TmpReg);
8034 MI.eraseFromParent();
8036 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8038 B.buildFrameIndex(
MI.getOperand(0), FI);
8039 MI.eraseFromParent();
8042 case Intrinsic::amdgcn_if:
8043 case Intrinsic::amdgcn_else: {
8046 bool Negated =
false;
8058 std::swap(CondBrTarget, UncondBrTarget);
8060 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8061 if (IntrID == Intrinsic::amdgcn_if) {
8062 B.buildInstr(AMDGPU::SI_IF)
8065 .addMBB(UncondBrTarget);
8067 B.buildInstr(AMDGPU::SI_ELSE)
8070 .addMBB(UncondBrTarget);
8079 B.buildBr(*CondBrTarget);
8084 MI.eraseFromParent();
8085 BrCond->eraseFromParent();
8091 case Intrinsic::amdgcn_loop: {
8094 bool Negated =
false;
8104 std::swap(CondBrTarget, UncondBrTarget);
8106 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8107 B.buildInstr(AMDGPU::SI_LOOP)
8109 .addMBB(UncondBrTarget);
8114 B.buildBr(*CondBrTarget);
8116 MI.eraseFromParent();
8117 BrCond->eraseFromParent();
8124 case Intrinsic::amdgcn_addrspacecast_nonnull:
8126 case Intrinsic::amdgcn_make_buffer_rsrc:
8128 case Intrinsic::amdgcn_kernarg_segment_ptr:
8131 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8132 MI.eraseFromParent();
8138 case Intrinsic::amdgcn_implicitarg_ptr:
8140 case Intrinsic::amdgcn_workitem_id_x:
8143 case Intrinsic::amdgcn_workitem_id_y:
8146 case Intrinsic::amdgcn_workitem_id_z:
8149 case Intrinsic::amdgcn_workgroup_id_x:
8154 case Intrinsic::amdgcn_workgroup_id_y:
8159 case Intrinsic::amdgcn_workgroup_id_z:
8164 case Intrinsic::amdgcn_cluster_id_x:
8165 return ST.hasClusters() &&
8168 case Intrinsic::amdgcn_cluster_id_y:
8169 return ST.hasClusters() &&
8172 case Intrinsic::amdgcn_cluster_id_z:
8173 return ST.hasClusters() &&
8176 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8177 return ST.hasClusters() &&
8180 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8181 return ST.hasClusters() &&
8184 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8185 return ST.hasClusters() &&
8188 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8189 return ST.hasClusters() &&
8191 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8192 return ST.hasClusters() &&
8195 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8196 return ST.hasClusters() &&
8199 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8200 return ST.hasClusters() &&
8203 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8204 return ST.hasClusters() &&
8208 case Intrinsic::amdgcn_wave_id:
8210 case Intrinsic::amdgcn_lds_kernel_id:
8213 case Intrinsic::amdgcn_dispatch_ptr:
8216 case Intrinsic::amdgcn_queue_ptr:
8219 case Intrinsic::amdgcn_implicit_buffer_ptr:
8222 case Intrinsic::amdgcn_dispatch_id:
8225 case Intrinsic::r600_read_ngroups_x:
8229 case Intrinsic::r600_read_ngroups_y:
8232 case Intrinsic::r600_read_ngroups_z:
8235 case Intrinsic::r600_read_local_size_x:
8238 case Intrinsic::r600_read_local_size_y:
8242 case Intrinsic::r600_read_local_size_z:
8245 case Intrinsic::amdgcn_fdiv_fast:
8247 case Intrinsic::amdgcn_is_shared:
8249 case Intrinsic::amdgcn_is_private:
8251 case Intrinsic::amdgcn_wavefrontsize: {
8252 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8253 MI.eraseFromParent();
8256 case Intrinsic::amdgcn_s_buffer_load:
8258 case Intrinsic::amdgcn_raw_buffer_store:
8259 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8260 case Intrinsic::amdgcn_struct_buffer_store:
8261 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8263 case Intrinsic::amdgcn_raw_buffer_store_format:
8264 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8265 case Intrinsic::amdgcn_struct_buffer_store_format:
8266 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8268 case Intrinsic::amdgcn_raw_tbuffer_store:
8269 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8270 case Intrinsic::amdgcn_struct_tbuffer_store:
8271 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8273 case Intrinsic::amdgcn_raw_buffer_load:
8274 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8275 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8276 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8277 case Intrinsic::amdgcn_struct_buffer_load:
8278 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8279 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8280 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8282 case Intrinsic::amdgcn_raw_buffer_load_format:
8283 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8284 case Intrinsic::amdgcn_struct_buffer_load_format:
8285 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8287 case Intrinsic::amdgcn_raw_tbuffer_load:
8288 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8289 case Intrinsic::amdgcn_struct_tbuffer_load:
8290 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8292 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8293 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8294 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8295 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8296 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8298 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8299 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8300 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8301 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8302 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8303 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8304 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8305 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8306 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8308 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8309 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8310 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8311 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8312 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8313 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8314 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8315 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8316 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8317 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8318 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8319 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8320 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8322 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8324 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8325 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8326 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8327 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8328 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8329 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8330 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8331 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8332 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8334 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8336 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8337 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8338 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8344 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8346 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8348 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8349 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8350 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8352 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8353 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8354 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8355 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8356 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8357 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8358 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8359 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8360 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8362 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8365 case Intrinsic::amdgcn_rsq_clamp:
8367 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8369 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8370 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8372 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8373 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8374 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8376 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8377 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8378 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8379 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8383 if (IndexArgTy !=
S64) {
8384 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8385 :
B.buildAnyExt(
S64, Index);
8386 MI.getOperand(5).setReg(NewIndex.getReg(0));
8390 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8391 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8392 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8393 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8394 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8395 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8396 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8397 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8401 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    // Only the 16x16x128 iu8 variant takes a 64-bit index; the others use a
    // 32-bit index. Here the index is operand 7.
    LLT IdxTy =
        IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 ? S64 : S32;
    Register Index = MI.getOperand(7).getReg();
    LLT IndexArgTy = MRI.getType(Index);
    if (IndexArgTy != IdxTy) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
                                            : B.buildAnyExt(IdxTy, Index);
      MI.getOperand(7).setReg(NewIndex.getReg(0));
    }
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Rewrite the intrinsic in place into the target pseudo G_AMDGPU_FMED3,
    // dropping the intrinsic ID operand and notifying the change observer.
    GISelChangeObserver &Observer = Helper.Observer;
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  case Intrinsic::amdgcn_dead: {
    // The results are never meaningfully used; define each of them as undef
    // and drop the call.
    for (MachineOperand &Def : MI.defs())
      B.buildUndef(Def.getReg());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
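  // The SWMMAC cases above all follow the same pattern: read the index
  // operand, and if its low-level type does not match what instruction
  // selection expects, rewrite the operand in place. Below is a minimal
  // illustrative sketch of that pattern, assuming an LLVM GlobalISel build
  // environment; the helper name and the free choice of operand index are
  // hypothetical and not part of this file.
  //
  // #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  // #include "llvm/CodeGen/MachineRegisterInfo.h"
  //
  // static void coerceIndexOperand(llvm::MachineInstr &MI,
  //                                llvm::MachineIRBuilder &B,
  //                                llvm::MachineRegisterInfo &MRI,
  //                                unsigned OpIdx, llvm::LLT WantedTy) {
  //   llvm::Register Index = MI.getOperand(OpIdx).getReg();
  //   llvm::LLT IndexTy = MRI.getType(Index);
  //   if (IndexTy == WantedTy)
  //     return;
  //   // Vector indices are reinterpreted with G_BITCAST; narrower scalars
  //   // are widened with G_ANYEXT, leaving the new high bits undefined.
  //   auto NewIndex = IndexTy.isVector() ? B.buildBitcast(WantedTy, Index)
  //                                      : B.buildAnyExt(WantedTy, Index);
  //   MI.getOperand(OpIdx).setReg(NewIndex.getReg(0));
  // }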