37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
758 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
760 if (ST.hasScalarAddSub64()) {
763 .clampMaxNumElementsStrict(0,
S16, 2)
771 .clampMaxNumElementsStrict(0,
S16, 2)
778 if (ST.hasScalarSMulU64()) {
781 .clampMaxNumElementsStrict(0,
S16, 2)
789 .clampMaxNumElementsStrict(0,
S16, 2)
799 .minScalarOrElt(0,
S16)
804 }
else if (ST.has16BitInsts()) {
838 .widenScalarToNextMultipleOf(0, 32)
848 if (ST.hasMad64_32())
853 if (ST.hasIntClamp()) {
876 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
886 if (ST.hasVOP3PInsts()) {
888 .clampMaxNumElements(0,
S8, 2)
909 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
921 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
928 .clampScalar(0,
S16,
S64);
961 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
962 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
969 if (ST.has16BitInsts()) {
970 if (ST.hasVOP3PInsts())
973 FPOpActions.legalFor({
S16});
975 TrigActions.customFor({
S16});
976 FDIVActions.customFor({
S16});
979 if (ST.hasPackedFP32Ops()) {
980 FPOpActions.legalFor({
V2S32});
981 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
984 if (ST.hasPackedFP64Ops()) {
985 FPOpActions.legalFor({
V2S64});
986 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
989 auto &MinNumMaxNumIeee =
992 if (ST.hasVOP3PInsts()) {
993 MinNumMaxNumIeee.legalFor(FPTypesPK16)
995 .clampMaxNumElements(0,
S16, 2)
998 }
else if (ST.has16BitInsts()) {
999 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
1001 MinNumMaxNumIeee.legalFor(FPTypesBase)
1002 .clampScalar(0,
S32,
S64)
1007 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1009 if (ST.hasVOP3PInsts()) {
1010 MinNumMaxNum.customFor(FPTypesPK16)
1012 .clampMaxNumElements(0,
S16, 2)
1013 .clampScalar(0,
S16,
S64)
1015 }
else if (ST.has16BitInsts()) {
1016 MinNumMaxNum.customFor(FPTypes16)
1017 .clampScalar(0,
S16,
S64)
1020 MinNumMaxNum.customFor(FPTypesBase)
1021 .clampScalar(0,
S32,
S64)
1025 if (ST.hasVOP3PInsts())
1042 .
legalFor(ST.hasPackedFP32Ops(), {V2S32})
1044 if (ST.hasPackedFP32Ops())
1048 if (ST.has16BitInsts()) {
1082 if (ST.hasFractBug()) {
1116 if (ST.hasCvtPkF16F32Inst()) {
1118 .clampMaxNumElements(0,
S16, 2);
1122 FPTruncActions.scalarize(0).lower();
1130 if (ST.has16BitInsts()) {
1144 if (ST.hasPackedFP32Ops())
1154 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1155 FMad.customFor({
S32,
S16});
1156 else if (ST.hasMadMacF32Insts())
1157 FMad.customFor({
S32});
1158 else if (ST.hasMadF16())
1159 FMad.customFor({
S16});
1164 if (ST.has16BitInsts()) {
1167 FRem.minScalar(0,
S32)
1176 .clampMaxNumElements(0,
S16, 2)
1195 if (ST.has16BitInsts())
1206 if (ST.has16BitInsts())
1219 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1223 if (
ST.has16BitInsts())
1233 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1234 .clampScalar(0,
S16,
S64)
1238 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1244 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1248 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1249 .clampScalar(0,
S16,
S64)
1253 if (
ST.has16BitInsts()) {
1254 getActionDefinitionsBuilder(
1255 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1257 .clampScalar(0,
S16,
S64)
1260 getActionDefinitionsBuilder(
1261 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1263 .clampScalar(0,
S32,
S64)
1266 getActionDefinitionsBuilder(
1267 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1270 .clampScalar(0,
S32,
S64)
1274 getActionDefinitionsBuilder(G_PTR_ADD)
1280 getActionDefinitionsBuilder(G_PTRMASK)
1282 .scalarSameSizeAs(1, 0)
1286 getActionDefinitionsBuilder(G_ICMP)
1298 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1299 .legalForCartesianProduct(
1300 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1301 if (
ST.has16BitInsts()) {
1302 CmpBuilder.legalFor({{
S1,
S16}});
1313 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1315 if (
ST.hasSALUFloatInsts())
1324 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1325 if (
ST.has16BitInsts())
1326 ExpOps.customFor({{
S32}, {
S16}});
1328 ExpOps.customFor({
S32});
1329 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1332 getActionDefinitionsBuilder(G_FPOWI)
1333 .clampScalar(0, MinScalarFPTy,
S32)
1336 getActionDefinitionsBuilder(G_FLOG2)
1337 .legalFor(
ST.has16BitInsts(), {S16})
1342 getActionDefinitionsBuilder(G_FEXP2)
1343 .legalFor(
ST.has16BitInsts(), {S16})
1349 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1351 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1355 getActionDefinitionsBuilder(G_CTPOP)
1357 .clampScalar(0,
S32,
S32)
1358 .widenScalarToNextPow2(1, 32)
1359 .clampScalar(1,
S32,
S64)
1361 .widenScalarToNextPow2(0, 32);
1364 if (
ST.has16BitInsts())
1365 getActionDefinitionsBuilder(G_IS_FPCLASS)
1366 .legalForCartesianProduct({
S1}, FPTypes16)
1367 .widenScalarToNextPow2(1)
1371 getActionDefinitionsBuilder(G_IS_FPCLASS)
1372 .legalForCartesianProduct({
S1}, FPTypesBase)
1373 .lowerFor({
S1,
S16})
1374 .widenScalarToNextPow2(1)
1381 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1383 .clampScalar(0,
S32,
S32)
1384 .clampScalar(1,
S32,
S64)
1385 .widenScalarToNextPow2(0, 32)
1386 .widenScalarToNextPow2(1, 32)
1390 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1393 .clampScalar(0,
S32,
S32)
1394 .clampScalar(1,
S32,
S64)
1396 .widenScalarToNextPow2(0, 32)
1397 .widenScalarToNextPow2(1, 32);
1399 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1401 .clampScalar(0,
S32,
S32)
1402 .clampScalar(1,
S32,
S64)
1404 .widenScalarToNextPow2(0, 32)
1405 .widenScalarToNextPow2(1, 32);
1407 getActionDefinitionsBuilder(G_CTLS)
1410 .clampScalar(0,
S32,
S32)
1411 .clampScalar(1,
S32,
S32);
1415 getActionDefinitionsBuilder(G_BITREVERSE)
1417 .clampScalar(0,
S32,
S64)
1419 .widenScalarToNextPow2(0);
1421 if (
ST.has16BitInsts()) {
1422 getActionDefinitionsBuilder(G_BSWAP)
1424 .clampMaxNumElementsStrict(0,
S16, 2)
1427 .widenScalarToNextPow2(0)
1428 .clampScalar(0,
S16,
S32)
1431 if (
ST.hasVOP3PInsts()) {
1432 getActionDefinitionsBuilder(G_ABS)
1434 .clampMaxNumElements(0,
S16, 2)
1436 .widenScalarToNextPow2(0)
1439 if (
ST.hasMinMaxI64Insts()) {
1440 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1442 .clampMaxNumElements(0,
S16, 2)
1444 .widenScalarToNextPow2(0)
1448 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1450 .clampMaxNumElements(0,
S16, 2)
1452 .widenScalarToNextPow2(0)
1457 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1459 .widenScalarToNextPow2(0)
1466 getActionDefinitionsBuilder(G_BSWAP)
1471 .widenScalarToNextPow2(0)
1476 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1479 .widenScalarToNextPow2(0)
1484 getActionDefinitionsBuilder(G_INTTOPTR)
1486 .legalForCartesianProduct(AddrSpaces64, {
S64})
1487 .legalForCartesianProduct(AddrSpaces32, {
S32})
1500 getActionDefinitionsBuilder(G_PTRTOINT)
1502 .legalForCartesianProduct(AddrSpaces64, {
S64})
1503 .legalForCartesianProduct(AddrSpaces32, {
S32})
1516 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1520 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1521 bool IsLoad) ->
bool {
1525 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1539 unsigned NumRegs = (MemSize + 31) / 32;
1541 if (!
ST.hasDwordx3LoadStores())
1552 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1553 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1554 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1560 for (
unsigned Op : {G_LOAD, G_STORE}) {
1561 const bool IsStore =
Op == G_STORE;
1563 auto &Actions = getActionDefinitionsBuilder(
Op);
1566 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1569 {
S64, GlobalPtr,
S64, GlobalAlign32},
1572 {
S32, GlobalPtr,
S8, GlobalAlign8},
1573 {
S32, GlobalPtr,
S16, GlobalAlign16},
1575 {
S32, LocalPtr,
S32, 32},
1576 {
S64, LocalPtr,
S64, 32},
1578 {
S32, LocalPtr,
S8, 8},
1579 {
S32, LocalPtr,
S16, 16},
1582 {
S32, PrivatePtr,
S32, 32},
1583 {
S32, PrivatePtr,
S8, 8},
1584 {
S32, PrivatePtr,
S16, 16},
1587 {
S32, ConstantPtr,
S32, GlobalAlign32},
1590 {
S64, ConstantPtr,
S64, GlobalAlign32},
1591 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1593 Actions.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1594 {{S16, GlobalPtr, S8, GlobalAlign8},
1595 {S16, GlobalPtr, S16, GlobalAlign16},
1596 {S16, LocalPtr, S8, 8},
1597 {S16, LocalPtr, S16, 16},
1598 {S16, PrivatePtr, S8, 8},
1599 {S16, PrivatePtr, S16, 16}});
1609 Actions.unsupportedIf(
1610 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1624 Actions.customIf(
typeIs(1, Constant32Ptr));
1650 return !Query.
Types[0].isVector() &&
1651 needToSplitMemOp(Query,
Op == G_LOAD);
1653 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1658 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1661 if (DstSize > MemSize)
1667 if (MemSize > MaxSize)
1675 return Query.
Types[0].isVector() &&
1676 needToSplitMemOp(Query,
Op == G_LOAD);
1678 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1692 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1693 if (MemSize > MaxSize) {
1697 if (MaxSize % EltSize == 0) {
1703 unsigned NumPieces = MemSize / MaxSize;
1707 if (NumPieces == 1 || NumPieces >= NumElts ||
1708 NumElts % NumPieces != 0)
1709 return std::pair(0, EltTy);
1717 return std::pair(0, EltTy);
1732 return std::pair(0, EltTy);
1737 .widenScalarToNextPow2(0)
1744 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1745 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1746 {
S32, GlobalPtr,
S16, 2 * 8},
1747 {
S32, LocalPtr,
S8, 8},
1748 {
S32, LocalPtr,
S16, 16},
1749 {
S32, PrivatePtr,
S8, 8},
1750 {
S32, PrivatePtr,
S16, 16},
1751 {
S32, ConstantPtr,
S8, 8},
1752 {
S32, ConstantPtr,
S16, 2 * 8}})
1753 .legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1754 {{S16, GlobalPtr, S8, GlobalAlign8},
1755 {S16, LocalPtr, S8, GlobalAlign8},
1756 {S16, PrivatePtr, S8, GlobalAlign8},
1757 {S16, ConstantPtr, S8, GlobalAlign8}})
1762 if (
ST.hasFlatAddressSpace()) {
1763 ExtLoads.legalForTypesWithMemDesc(
1764 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1766 ExtLoads.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1767 {{S16, FlatPtr, S8, GlobalAlign8}});
1775 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1777 ExtLoads.narrowScalarIf(
1784 ExtLoads.clampScalar(0,
S32,
S32)
1785 .widenScalarToNextPow2(0)
1788 auto &Atomics = getActionDefinitionsBuilder(
1789 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1790 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1791 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1792 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1793 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1794 {
S64, GlobalPtr}, {
S64, LocalPtr},
1795 {
S32, RegionPtr}, {
S64, RegionPtr}});
1796 if (
ST.hasFlatAddressSpace()) {
1797 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1801 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1802 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1803 if (
ST.hasFlatAddressSpace()) {
1804 Atomics32.legalFor({{
S32, FlatPtr}});
1808 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1809 if (
ST.hasLDSFPAtomicAddF32()) {
1810 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1811 if (
ST.hasLdsAtomicAddF64())
1812 Atomic.legalFor({{
S64, LocalPtr}});
1813 if (
ST.hasAtomicDsPkAdd16Insts())
1814 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1816 if (
ST.hasAtomicFaddInsts())
1817 Atomic.legalFor({{
S32, GlobalPtr}});
1818 if (
ST.hasFlatAtomicFaddF32Inst())
1819 Atomic.legalFor({{
S32, FlatPtr}});
1821 if (
ST.hasGFX90AInsts() ||
ST.hasGFX1250Insts()) {
1832 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1833 ST.hasAtomicBufferGlobalPkAddF16Insts())
1834 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1835 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1836 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1837 if (
ST.hasAtomicFlatPkAdd16Insts())
1838 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1843 auto &AtomicFMinFMax =
1844 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1845 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1847 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1848 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1849 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1850 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1851 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1852 AtomicFMinFMax.legalFor({
F32, FlatPtr});
1853 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1854 AtomicFMinFMax.legalFor({
F64, FlatPtr});
1858 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1859 .customFor({{
S32, GlobalPtr}, {
S64, GlobalPtr},
1860 {
S32, FlatPtr}, {
S64, FlatPtr}})
1861 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1862 {
S32, RegionPtr}, {
S64, RegionPtr}});
1866 getActionDefinitionsBuilder(G_SELECT)
1868 LocalPtr, FlatPtr, PrivatePtr,
1872 .clampScalar(0,
S16,
S64)
1876 .clampMaxNumElements(0,
S32, 2)
1877 .clampMaxNumElements(0, LocalPtr, 2)
1878 .clampMaxNumElements(0, PrivatePtr, 2)
1880 .widenScalarToNextPow2(0)
1885 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1887 if (
ST.has16BitInsts()) {
1888 if (
ST.hasVOP3PInsts()) {
1890 .clampMaxNumElements(0,
S16, 2);
1892 Shifts.legalFor({{
S16,
S16}});
1895 Shifts.widenScalarIf(
1900 const LLT AmountTy = Query.
Types[1];
1905 Shifts.clampScalar(1,
S32,
S32);
1906 Shifts.widenScalarToNextPow2(0, 16);
1907 Shifts.clampScalar(0,
S16,
S64);
1909 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1917 Shifts.clampScalar(1,
S32,
S32);
1918 Shifts.widenScalarToNextPow2(0, 32);
1919 Shifts.clampScalar(0,
S32,
S64);
1921 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1926 Shifts.scalarize(0);
1928 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1929 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1930 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1931 unsigned IdxTypeIdx = 2;
1933 getActionDefinitionsBuilder(
Op)
1935 const LLT EltTy = Query.
Types[EltTypeIdx];
1936 const LLT VecTy = Query.
Types[VecTypeIdx];
1937 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1939 const bool isLegalVecType =
1949 return (EltSize == 32 || EltSize == 64) &&
1965 const LLT EltTy = Query.
Types[EltTypeIdx];
1966 const LLT VecTy = Query.
Types[VecTypeIdx];
1970 const unsigned TargetEltSize =
1971 DstEltSize % 64 == 0 ? 64 : 32;
1972 return std::pair(VecTypeIdx,
1976 .clampScalar(EltTypeIdx,
S32,
S64)
1977 .clampScalar(VecTypeIdx,
S32,
S64)
1978 .clampScalar(IdxTypeIdx,
S32,
S32)
1979 .clampMaxNumElements(VecTypeIdx,
S32, 32)
1988 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1990 const LLT &EltTy = Query.
Types[1].getElementType();
1991 return Query.
Types[0] != EltTy;
1994 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1995 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1996 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1997 getActionDefinitionsBuilder(
Op)
2000 const LLT BigTy = Query.
Types[BigTyIdx];
2006 const LLT LitTy = Query.
Types[LitTyIdx];
2011 .widenScalarToNextPow2(BigTyIdx, 32)
2019 const LLT BigTy = Query.
Types[BigTyIdx];
2020 const LLT LitTy = Query.
Types[LitTyIdx];
2028 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2037 if (
ST.hasScalarPackInsts()) {
2040 .minScalarOrElt(0,
S16)
2043 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2047 BuildVector.customFor({
V2S16,
S16});
2048 BuildVector.minScalarOrElt(0,
S32);
2050 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2058 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2060 .clampMaxNumElements(0,
S32, 32)
2061 .clampMaxNumElements(1,
S16, 2)
2062 .clampMaxNumElements(0,
S16, 64);
2064 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2067 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2068 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2069 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2071 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2072 const LLT Ty = Query.
Types[TypeIdx];
2084 getActionDefinitionsBuilder(
Op)
2088 const LLT BigTy = Query.
Types[BigTyIdx];
2094 .widenScalarToNextPow2(LitTyIdx, 16)
2103 .clampScalar(LitTyIdx,
S32,
S512)
2104 .widenScalarToNextPow2(LitTyIdx, 32)
2108 return notValidElt(Query, LitTyIdx);
2113 return notValidElt(Query, BigTyIdx);
2118 if (
Op == G_MERGE_VALUES) {
2119 Builder.widenScalarIf(
2122 const LLT Ty = Query.
Types[LitTyIdx];
2128 Builder.widenScalarIf(
2130 const LLT Ty = Query.
Types[BigTyIdx];
2136 const LLT &Ty = Query.
Types[BigTyIdx];
2138 if (NewSizeInBits >= 256) {
2140 if (RoundedTo < NewSizeInBits)
2141 NewSizeInBits = RoundedTo;
2143 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2152 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2153 .legalFor({{
S32}, {
S64}})
2154 .clampScalar(0,
S32,
S64);
2156 if (
ST.hasVOP3PInsts()) {
2157 SextInReg.lowerFor({{
V2S16}})
2161 .clampMaxNumElementsStrict(0,
S16, 2);
2162 }
else if (
ST.has16BitInsts()) {
2163 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2167 SextInReg.lowerFor({{
S32}, {
S64}});
2172 .clampScalar(0,
S32,
S64)
2175 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2179 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2180 FSHRActionDefs.legalFor({{
S32,
S32}})
2181 .clampMaxNumElementsStrict(0,
S16, 2);
2182 if (
ST.hasVOP3PInsts())
2184 FSHRActionDefs.scalarize(0).lower();
2186 if (
ST.hasVOP3PInsts()) {
2187 getActionDefinitionsBuilder(G_FSHL)
2189 .clampMaxNumElementsStrict(0,
S16, 2)
2193 getActionDefinitionsBuilder(G_FSHL)
2198 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2201 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({
S64});
2203 getActionDefinitionsBuilder(G_FENCE)
2206 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2211 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2213 .clampScalar(1,
S32,
S32)
2214 .clampScalar(0,
S32,
S64)
2215 .widenScalarToNextPow2(0)
2218 getActionDefinitionsBuilder(
2222 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2223 G_READ_REGISTER, G_WRITE_REGISTER,
2228 if (
ST.hasIEEEMinimumMaximumInsts()) {
2229 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2230 .legalFor(FPTypesPK16)
2231 .clampMaxNumElements(0,
S16, 2)
2233 }
else if (
ST.hasVOP3PInsts()) {
2234 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2236 .clampMaxNumElementsStrict(0,
S16, 2)
2240 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2242 .clampScalar(0,
S32,
S64)
2246 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2249 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2251 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2252 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2253 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2256 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2258 getActionDefinitionsBuilder(
2259 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2260 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2261 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2262 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2267 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2268 G_INTRINSIC_CONVERGENT,
2269 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2272 getLegacyLegalizerInfo().computeTables();
2282 switch (
MI.getOpcode()) {
2283 case TargetOpcode::G_ADDRSPACE_CAST:
2285 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2287 case TargetOpcode::G_FCEIL:
2289 case TargetOpcode::G_FREM:
2291 case TargetOpcode::G_INTRINSIC_TRUNC:
2293 case TargetOpcode::G_SITOFP:
2295 case TargetOpcode::G_UITOFP:
2297 case TargetOpcode::G_FPTOSI:
2299 case TargetOpcode::G_FPTOUI:
2301 case TargetOpcode::G_FMINNUM:
2302 case TargetOpcode::G_FMAXNUM:
2303 case TargetOpcode::G_FMINIMUMNUM:
2304 case TargetOpcode::G_FMAXIMUMNUM:
2306 case TargetOpcode::G_EXTRACT:
2308 case TargetOpcode::G_INSERT:
2310 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2312 case TargetOpcode::G_INSERT_VECTOR_ELT:
2314 case TargetOpcode::G_FSIN:
2315 case TargetOpcode::G_FCOS:
2317 case TargetOpcode::G_GLOBAL_VALUE:
2319 case TargetOpcode::G_LOAD:
2320 case TargetOpcode::G_SEXTLOAD:
2321 case TargetOpcode::G_ZEXTLOAD:
2323 case TargetOpcode::G_STORE:
2325 case TargetOpcode::G_FMAD:
2327 case TargetOpcode::G_FDIV:
2329 case TargetOpcode::G_FFREXP:
2331 case TargetOpcode::G_FSQRT:
2333 case TargetOpcode::G_UDIV:
2334 case TargetOpcode::G_UREM:
2335 case TargetOpcode::G_UDIVREM:
2337 case TargetOpcode::G_SDIV:
2338 case TargetOpcode::G_SREM:
2339 case TargetOpcode::G_SDIVREM:
2341 case TargetOpcode::G_ATOMIC_CMPXCHG:
2343 case TargetOpcode::G_FLOG2:
2345 case TargetOpcode::G_FLOG:
2346 case TargetOpcode::G_FLOG10:
2348 case TargetOpcode::G_FEXP2:
2350 case TargetOpcode::G_FEXP:
2351 case TargetOpcode::G_FEXP10:
2353 case TargetOpcode::G_FPOW:
2355 case TargetOpcode::G_FFLOOR:
2357 case TargetOpcode::G_BUILD_VECTOR:
2358 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2360 case TargetOpcode::G_MUL:
2362 case TargetOpcode::G_CTLZ:
2363 case TargetOpcode::G_CTTZ:
2365 case TargetOpcode::G_CTLS:
2367 case TargetOpcode::G_CTLZ_ZERO_POISON:
2369 case TargetOpcode::G_STACKSAVE:
2371 case TargetOpcode::G_GET_FPENV:
2373 case TargetOpcode::G_SET_FPENV:
2375 case TargetOpcode::G_TRAP:
2377 case TargetOpcode::G_DEBUGTRAP:
2397 if (ST.hasApertureRegs()) {
2402 ? AMDGPU::SRC_SHARED_BASE
2403 : AMDGPU::SRC_PRIVATE_BASE;
2404 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2405 !ST.hasGloballyAddressableScratch()) &&
2406 "Cannot use src_private_base with globally addressable scratch!");
2409 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2410 return B.buildUnmerge(
S32, Dst).getReg(1);
2425 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2441 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2444 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2466 B.buildObjectPtrOffset(
2468 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2469 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2477 switch (Def->getOpcode()) {
2478 case AMDGPU::G_FRAME_INDEX:
2479 case AMDGPU::G_GLOBAL_VALUE:
2480 case AMDGPU::G_BLOCK_ADDR:
2482 case AMDGPU::G_CONSTANT: {
2483 const ConstantInt *CI = Def->getOperand(1).getCImm();
2500 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2502 Intrinsic::amdgcn_addrspacecast_nonnull));
2507 :
MI.getOperand(1).getReg();
2511 unsigned SrcAS = SrcTy.getAddressSpace();
2521 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2528 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2530 ST.hasGloballyAddressableScratch()) {
2534 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2536 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2537 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2539 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2541 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2545 return B.buildExtract(Dst, Src, 0).getReg(0);
2551 castFlatToLocalOrPrivate(Dst);
2552 MI.eraseFromParent();
2558 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2559 auto FlatNull =
B.buildConstant(SrcTy, 0);
2562 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2566 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2568 MI.eraseFromParent();
2575 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2578 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2581 ST.hasGloballyAddressableScratch()) {
2586 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2590 if (ST.isWave64()) {
2591 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2597 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2598 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2600 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2604 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2605 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2607 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2608 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2617 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2623 castLocalOrPrivateToFlat(Dst);
2624 MI.eraseFromParent();
2628 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2635 SegmentNull.getReg(0));
2637 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2639 MI.eraseFromParent();
2644 SrcTy.getSizeInBits() == 64) {
2646 B.buildExtract(Dst, Src, 0);
2647 MI.eraseFromParent();
2654 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2655 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2656 if (AddrHiVal == 0) {
2658 B.buildIntToPtr(Dst, Zext);
2660 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2661 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2664 MI.eraseFromParent();
2671 MI.eraseFromParent();
2680 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2685 auto C1 =
B.buildFConstant(Ty, C1Val);
2686 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2689 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2690 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2692 auto C2 =
B.buildFConstant(Ty, C2Val);
2693 auto Fabs =
B.buildFAbs(Ty, Src);
2696 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2697 MI.eraseFromParent();
2715 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2717 const auto Zero =
B.buildFConstant(
S64, 0.0);
2718 const auto One =
B.buildFConstant(
S64, 1.0);
2721 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2722 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2725 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2726 MI.eraseFromParent();
2734 Register Src0Reg =
MI.getOperand(1).getReg();
2735 Register Src1Reg =
MI.getOperand(2).getReg();
2736 auto Flags =
MI.getFlags();
2739 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2740 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2741 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2742 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2743 MI.eraseFromParent();
2749 const unsigned FractBits = 52;
2750 const unsigned ExpBits = 11;
2753 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2754 auto Const1 =
B.buildConstant(
S32, ExpBits);
2756 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2758 .addUse(Const0.getReg(0))
2759 .addUse(Const1.getReg(0));
2761 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2775 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2782 const unsigned FractBits = 52;
2785 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2786 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2788 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2790 const auto Zero32 =
B.buildConstant(
S32, 0);
2793 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2795 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2796 auto Not =
B.buildNot(
S64, Shr);
2797 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2798 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2803 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2804 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2805 MI.eraseFromParent();
2821 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2822 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2825 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2826 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2828 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2829 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2832 B.buildFAdd(Dst, LdExp, CvtLo);
2833 MI.eraseFromParent();
2839 auto One =
B.buildConstant(
S32, 1);
2843 auto ThirtyOne =
B.buildConstant(
S32, 31);
2844 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2845 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2846 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2847 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2848 .addUse(Unmerge.getReg(1));
2849 auto LS2 =
B.buildSub(
S32, LS, One);
2850 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2852 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2853 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2854 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2855 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2856 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2857 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2858 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2859 B.buildFLdexp(Dst, FVal, Scale);
2860 MI.eraseFromParent();
2880 unsigned Flags =
MI.getFlags();
2891 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2899 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2900 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2904 K0 =
B.buildFConstant(
2906 K1 =
B.buildFConstant(
2909 K0 =
B.buildFConstant(
2911 K1 =
B.buildFConstant(
2915 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2916 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2917 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2920 :
B.buildFPTOUI(
S32, FloorMul);
2921 auto Lo =
B.buildFPTOUI(
S32, Fma);
2925 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2927 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2930 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2931 MI.eraseFromParent();
2963 unsigned StartIdx =
Offset / 32;
2965 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
2967 if (DstCount == 1) {
2969 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2974 for (
unsigned I = 0;
I < DstCount; ++
I)
2975 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
2976 B.buildMergeLikeInstr(DstReg, MergeVec);
2979 MI.eraseFromParent();
2989 Register InsertSrc =
MI.getOperand(2).getReg();
2998 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3002 unsigned DstCount = DstSize / 32;
3003 unsigned InsertCount = InsertSize / 32;
3004 unsigned StartIdx =
Offset / 32;
3006 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
3009 for (
unsigned I = 0;
I < StartIdx; ++
I)
3012 if (InsertCount == 1) {
3016 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
3019 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
3020 for (
unsigned I = 0;
I < InsertCount; ++
I)
3024 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
3027 B.buildMergeLikeInstr(DstReg, MergeVec);
3029 MI.eraseFromParent();
3056 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3057 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3058 B.buildIntToPtr(Dst, IntElt);
3060 MI.eraseFromParent();
3067 std::optional<ValueAndVReg> MaybeIdxVal =
3071 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3074 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3075 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3080 MI.eraseFromParent();
3109 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3110 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3111 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3113 B.buildIntToPtr(Dst, IntVecDest);
3114 MI.eraseFromParent();
3121 std::optional<ValueAndVReg> MaybeIdxVal =
3126 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3129 if (IdxVal < NumElts) {
3131 for (
unsigned i = 0; i < NumElts; ++i)
3133 B.buildUnmerge(SrcRegs, Vec);
3135 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3136 B.buildMergeLikeInstr(Dst, SrcRegs);
3141 MI.eraseFromParent();
3152 unsigned Flags =
MI.getFlags();
3156 if (ST.hasTrigReducedRange()) {
3157 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3158 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3159 .addUse(MulVal.getReg(0))
3163 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3166 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3170 MI.eraseFromParent();
3178 unsigned GAFlags)
const {
3207 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3209 if (ST.has64BitLiterals()) {
3213 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3217 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3226 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3227 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3230 B.buildExtract(DstReg, PCReg, 0);
3240 if (RequiresHighHalf && ST.has64BitLiterals()) {
3242 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3243 B.buildInstr(AMDGPU::S_MOV_B64)
3258 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3261 B.buildInstr(AMDGPU::S_MOV_B32)
3266 if (RequiresHighHalf) {
3268 "Must provide a 64-bit pointer type!");
3271 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3273 B.buildInstr(AMDGPU::S_MOV_B32)
3284 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3286 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3290 if (AddrDst != DstReg)
3291 B.buildCast(DstReg, AddrDst);
3292 }
else if (AddrLo != DstReg) {
3295 B.buildCast(DstReg, AddrLo);
3304 unsigned AS = Ty.getAddressSpace();
3312 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3316 Fn,
"local memory global used by non-kernel function",
3325 B.buildUndef(DstReg);
3326 MI.eraseFromParent();
3350 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3351 B.buildIntToPtr(DstReg, Sz);
3352 MI.eraseFromParent();
3358 MI.eraseFromParent();
3362 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3364 MI.eraseFromParent();
3372 MI.eraseFromParent();
3378 MI.eraseFromParent();
3394 if (Ty.getSizeInBits() == 32) {
3396 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3397 B.buildExtract(DstReg, Load, 0);
3399 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3401 MI.eraseFromParent();
3424 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3426 MI.getOperand(1).setReg(Cast.getReg(0));
3431 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3457 if (WideMemSize == ValSize) {
3463 MI.setMemRefs(MF, {WideMMO});
3469 if (ValSize > WideMemSize)
3476 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3477 B.buildTrunc(ValReg, WideLoad).getReg(0);
3484 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3485 B.buildExtract(ValReg, WideLoad, 0);
3489 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3490 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3494 MI.eraseFromParent();
3507 Register DataReg =
MI.getOperand(0).getReg();
3552 "this should not have been custom lowered");
3557 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3559 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3563 .setMemRefs(
MI.memoperands());
3565 MI.eraseFromParent();
3573 switch (
DefMI->getOpcode()) {
3574 case TargetOpcode::G_INTRINSIC: {
3576 case Intrinsic::amdgcn_frexp_mant:
3577 case Intrinsic::amdgcn_log:
3578 case Intrinsic::amdgcn_log_clamp:
3579 case Intrinsic::amdgcn_exp2:
3580 case Intrinsic::amdgcn_sqrt:
3588 case TargetOpcode::G_FSQRT:
3590 case TargetOpcode::G_FFREXP: {
3591 if (
DefMI->getOperand(0).getReg() == Src)
3595 case TargetOpcode::G_FPEXT: {
3616std::pair<Register, Register>
3618 unsigned Flags)
const {
3623 auto SmallestNormal =
B.buildFConstant(
3625 auto IsLtSmallestNormal =
3628 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3629 auto One =
B.buildFConstant(
F32, 1.0);
3631 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3632 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3634 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3647 LLT Ty =
B.getMRI()->getType(Dst);
3648 unsigned Flags =
MI.getFlags();
3653 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3654 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3655 .addUse(Ext.getReg(0))
3657 B.buildFPTrunc(Dst,
Log2, Flags);
3658 MI.eraseFromParent();
3666 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3669 MI.eraseFromParent();
3673 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3674 .addUse(ScaledInput)
3677 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3678 auto Zero =
B.buildFConstant(Ty, 0.0);
3680 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3681 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3683 MI.eraseFromParent();
3689 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3690 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3695 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3696 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3701 unsigned Flags =
MI.getFlags();
3714 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3716 B.buildFPTrunc(Dst, LogVal);
3721 MI.eraseFromParent();
3730 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3733 if (ST.hasFastFMAF32()) {
3735 const float c_log10 = 0x1.344134p-2f;
3736 const float cc_log10 = 0x1.09f79ep-26f;
3739 const float c_log = 0x1.62e42ep-1f;
3740 const float cc_log = 0x1.efa39ep-25f;
3742 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3743 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3747 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3748 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3749 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3750 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3751 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3754 const float ch_log10 = 0x1.344000p-2f;
3755 const float ct_log10 = 0x1.3509f6p-18f;
3758 const float ch_log = 0x1.62e000p-1f;
3759 const float ct_log = 0x1.0bfbe8p-15f;
3761 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3762 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3764 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3765 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3766 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3770 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3773 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3775 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3778 const bool IsFiniteOnly =
3781 if (!IsFiniteOnly) {
3784 auto Fabs =
B.buildFAbs(Ty,
Y);
3787 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3791 auto Zero =
B.buildFConstant(Ty, 0.0);
3793 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3794 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3795 B.buildFSub(Dst, R, Shift, Flags);
3797 B.buildCopy(Dst, R);
3800 MI.eraseFromParent();
3806 unsigned Flags)
const {
3807 const double Log2BaseInverted =
3810 LLT Ty =
B.getMRI()->getType(Dst);
3815 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3818 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3819 auto Zero =
B.buildFConstant(Ty, 0.0);
3821 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3822 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3824 if (ST.hasFastFMAF32())
3825 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3827 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3828 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3836 ?
B.buildFLog2(Ty, Src, Flags)
3837 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3840 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3841 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3852 unsigned Flags =
MI.getFlags();
3853 LLT Ty =
B.getMRI()->getType(Dst);
3863 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3864 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3865 .addUse(Ext.getReg(0))
3867 B.buildFPTrunc(Dst,
Log2, Flags);
3868 MI.eraseFromParent();
3878 MI.eraseFromParent();
3886 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3888 RangeCheckConst, Flags);
3890 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3891 auto Zero =
B.buildFConstant(Ty, 0.0);
3892 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3893 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3895 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3896 .addUse(AddInput.getReg(0))
3899 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3900 auto One =
B.buildFConstant(Ty, 1.0);
3901 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3902 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3903 MI.eraseFromParent();
3908 const SrcOp &Src,
unsigned Flags) {
3909 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3912 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3913 .addUse(Src.getReg())
3916 return B.buildFExp2(Dst, Src, Flags);
3922 bool IsExp10)
const {
3923 LLT Ty =
B.getMRI()->getType(
X);
3927 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3928 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3935 LLT Ty =
B.getMRI()->getType(Dst);
3942 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3945 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3946 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3947 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3950 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3952 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3953 .addUse(ExpInput.getReg(0))
3956 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3957 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3958 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3964 unsigned Flags)
const {
3965 LLT Ty =
B.getMRI()->getType(Dst);
3970 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3971 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3973 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
3974 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3975 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
3976 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3977 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3987 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
3991 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
3992 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3993 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
3995 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3996 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3998 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
3999 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
4000 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
4001 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4003 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
4004 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
4005 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4007 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4026 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
4028 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
4030 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
4032 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4033 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4034 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
4035 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
4037 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
4038 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
4039 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4040 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4042 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4043 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
4044 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4045 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4046 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4048 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4049 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4050 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4051 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4054 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4055 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4056 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4058 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4059 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4060 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4061 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4062 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4066 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4067 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4069 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4071 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4073 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4075 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4077 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4078 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4079 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4080 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4082 auto One =
B.buildFConstant(
S64, 1.0);
4083 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4084 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4087 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4088 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4095 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4102 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4104 MI.eraseFromParent();
4112 const unsigned Flags =
MI.getFlags();
4124 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4132 MI.eraseFromParent();
4143 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4146 B.buildFPTrunc(Dst, Lowered, Flags);
4147 MI.eraseFromParent();
4158 MI.eraseFromParent();
4186 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4189 if (ST.hasFastFMAF32()) {
4191 const float cc_exp = 0x1.4ae0bep-26f;
4192 const float c_exp10 = 0x1.a934f0p+1f;
4193 const float cc_exp10 = 0x1.2f346ep-24f;
4195 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4196 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4197 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4198 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4200 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4201 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4203 const float ch_exp = 0x1.714000p+0f;
4204 const float cl_exp = 0x1.47652ap-12f;
4206 const float ch_exp10 = 0x1.a92000p+1f;
4207 const float cl_exp10 = 0x1.4f0978p-11f;
4209 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4210 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4211 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4213 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4214 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4216 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4217 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4220 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4221 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4224 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4227 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4228 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4231 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4232 .addUse(
A.getReg(0))
4234 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4236 auto UnderflowCheckConst =
4237 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4238 auto Zero =
B.buildFConstant(Ty, 0.0);
4242 R =
B.buildSelect(Ty, Underflow, Zero, R);
4245 auto OverflowCheckConst =
4246 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4251 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4254 B.buildCopy(Dst, R);
4255 MI.eraseFromParent();
4264 unsigned Flags =
MI.getFlags();
4265 LLT Ty =
B.getMRI()->getType(Dst);
4270 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4271 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4272 .addUse(Log.getReg(0))
4275 B.buildFExp2(Dst,
Mul, Flags);
4276 }
else if (Ty == F16) {
4278 auto Log =
B.buildFLog2(F16, Src0, Flags);
4279 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4280 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4281 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4282 .addUse(Ext0.getReg(0))
4283 .addUse(Ext1.getReg(0))
4285 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4289 MI.eraseFromParent();
4297 ModSrc = SrcFNeg->getOperand(1).getReg();
4299 ModSrc = SrcFAbs->getOperand(1).getReg();
4301 ModSrc = SrcFAbs->getOperand(1).getReg();
4312 Register OrigSrc =
MI.getOperand(1).getReg();
4313 unsigned Flags =
MI.getFlags();
4315 "this should not have been custom lowered");
4325 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4345 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4347 B.buildFMinNum(Min, Fract, Const, Flags);
4352 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4355 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4356 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4358 MI.eraseFromParent();
4374 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4376 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4377 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4380 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4381 B.buildBitcast(Dst,
Merge);
4383 MI.eraseFromParent();
4400 bool UsePartialMad64_32,
4401 bool SeparateOddAlignedProducts)
const {
4416 auto getZero32 = [&]() ->
Register {
4418 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4421 auto getZero64 = [&]() ->
Register {
4423 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4428 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4439 if (CarryIn.empty())
4442 bool HaveCarryOut =
true;
4444 if (CarryIn.size() == 1) {
4446 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4450 CarryAccum = getZero32();
4452 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4453 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4455 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4460 LocalAccum = getZero32();
4461 HaveCarryOut =
false;
4466 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4467 LocalAccum =
Add.getReg(0);
4481 auto buildMadChain =
4484 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4485 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4492 if (LocalAccum.size() == 1 &&
4493 (!UsePartialMad64_32 || !CarryIn.empty())) {
4496 unsigned j1 = DstIndex - j0;
4497 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4501 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4503 LocalAccum[0] =
Mul.getReg(0);
4505 if (CarryIn.empty()) {
4506 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4509 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4515 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4519 if (j0 <= DstIndex) {
4520 bool HaveSmallAccum =
false;
4523 if (LocalAccum[0]) {
4524 if (LocalAccum.size() == 1) {
4525 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4526 HaveSmallAccum =
true;
4527 }
else if (LocalAccum[1]) {
4528 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4529 HaveSmallAccum =
false;
4531 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4532 HaveSmallAccum =
true;
4535 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4537 HaveSmallAccum =
true;
4541 unsigned j1 = DstIndex - j0;
4542 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4546 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4547 {Src0[j0], Src1[j1], Tmp});
4548 Tmp = Mad.getReg(0);
4549 if (!HaveSmallAccum)
4550 CarryOut.push_back(Mad.getReg(1));
4551 HaveSmallAccum =
false;
4554 }
while (j0 <= DstIndex);
4556 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4557 LocalAccum[0] = Unmerge.getReg(0);
4558 if (LocalAccum.size() > 1)
4559 LocalAccum[1] = Unmerge.getReg(1);
4586 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4587 Carry OddCarryIn = std::move(OddCarry);
4588 Carry EvenCarryIn = std::move(EvenCarry);
4593 if (2 * i < Accum.
size()) {
4594 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4595 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4600 if (!SeparateOddAlignedProducts) {
4601 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4602 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4604 bool IsHighest = 2 * i >= Accum.
size();
4607 .take_front(IsHighest ? 1 : 2);
4608 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4614 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4616 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4618 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4621 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4624 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4625 Lo->getOperand(1).getReg());
4626 Accum[2 * i] =
Hi.getReg(0);
4627 SeparateOddCarry =
Hi.getReg(1);
4634 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4635 EvenCarryIn.push_back(CarryOut);
4637 if (2 * i < Accum.
size()) {
4638 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4639 OddCarry.push_back(CarryOut);
4651 assert(ST.hasMad64_32());
4652 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4664 unsigned Size = Ty.getSizeInBits();
4665 if (ST.hasVMulU64Inst() &&
Size == 64)
4668 unsigned NumParts =
Size / 32;
4680 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4684 for (
unsigned i = 0; i < NumParts; ++i) {
4688 B.buildUnmerge(Src0Parts, Src0);
4689 B.buildUnmerge(Src1Parts, Src1);
4692 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4693 SeparateOddAlignedProducts);
4695 B.buildMergeLikeInstr(DstReg, AccumRegs);
4696 MI.eraseFromParent();
4711 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4712 ? AMDGPU::G_AMDGPU_FFBH_U32
4713 : AMDGPU::G_AMDGPU_FFBL_B32;
4714 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4717 MI.eraseFromParent();
4727 TypeSize NumBits = SrcTy.getSizeInBits();
4731 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4732 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4733 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4734 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4735 B.buildTrunc(Dst, Ctlz);
4736 MI.eraseFromParent();
4747 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4748 unsigned BitWidth = SrcTy.getSizeInBits();
4750 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4752 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4753 MI.eraseFromParent();
4759 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4762 return ConstVal == -1;
4769 Register CondDef =
MI.getOperand(0).getReg();
4788 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4797 UncondBrTarget = &*NextMBB;
4799 if (
Next->getOpcode() != AMDGPU::G_BR)
4818 *ArgRC,
B.getDebugLoc(), ArgTy);
4822 const unsigned Mask = Arg->
getMask();
4830 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4831 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4834 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4836 B.buildCopy(DstReg, LiveIn);
4846 if (!ST.hasClusters()) {
4849 MI.eraseFromParent();
4869 auto One =
B.buildConstant(
S32, 1);
4870 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4871 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4872 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4879 B.buildCopy(DstReg, GlobalIdXYZ);
4880 MI.eraseFromParent();
4884 B.buildCopy(DstReg, ClusterIdXYZ);
4885 MI.eraseFromParent();
4890 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4892 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4893 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4895 .addImm(ClusterIdField);
4896 auto Zero =
B.buildConstant(
S32, 0);
4899 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4900 MI.eraseFromParent();
4942 auto LoadConstant = [&](
unsigned N) {
4943 B.buildConstant(DstReg,
N);
4947 if (ST.hasArchitectedSGPRs() &&
4954 Arg = &WorkGroupIDX;
4955 ArgRC = &AMDGPU::SReg_32RegClass;
4959 Arg = &WorkGroupIDY;
4960 ArgRC = &AMDGPU::SReg_32RegClass;
4964 Arg = &WorkGroupIDZ;
4965 ArgRC = &AMDGPU::SReg_32RegClass;
4969 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
4970 return LoadConstant(0);
4971 Arg = &ClusterWorkGroupIDX;
4972 ArgRC = &AMDGPU::SReg_32RegClass;
4976 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
4977 return LoadConstant(0);
4978 Arg = &ClusterWorkGroupIDY;
4979 ArgRC = &AMDGPU::SReg_32RegClass;
4983 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
4984 return LoadConstant(0);
4985 Arg = &ClusterWorkGroupIDZ;
4986 ArgRC = &AMDGPU::SReg_32RegClass;
4991 return LoadConstant(ClusterDims.
getDims()[0] - 1);
4992 Arg = &ClusterWorkGroupMaxIDX;
4993 ArgRC = &AMDGPU::SReg_32RegClass;
4998 return LoadConstant(ClusterDims.
getDims()[1] - 1);
4999 Arg = &ClusterWorkGroupMaxIDY;
5000 ArgRC = &AMDGPU::SReg_32RegClass;
5005 return LoadConstant(ClusterDims.
getDims()[2] - 1);
5006 Arg = &ClusterWorkGroupMaxIDZ;
5007 ArgRC = &AMDGPU::SReg_32RegClass;
5011 Arg = &ClusterWorkGroupMaxFlatID;
5012 ArgRC = &AMDGPU::SReg_32RegClass;
5027 return LoadConstant(0);
5032 B.buildUndef(DstReg);
5036 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5048 MI.eraseFromParent();
5054 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5055 MI.eraseFromParent();
5062 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5076 B.buildUndef(DstReg);
5077 MI.eraseFromParent();
5081 if (Arg->isMasked()) {
5095 MI.eraseFromParent();
5110 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5119 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5127 Align Alignment)
const {
5131 "unexpected kernarg parameter type");
5138 MI.eraseFromParent();
5173 auto FloatY =
B.buildUITOFP(
S32,
Y);
5174 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5176 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5177 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5180 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5181 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5182 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5185 auto Q =
B.buildUMulH(
S32,
X, Z);
5186 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5189 auto One =
B.buildConstant(
S32, 1);
5192 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5198 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5201 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5220 auto Unmerge =
B.buildUnmerge(
S32, Val);
5222 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5223 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5225 auto Mad =
B.buildFMAD(
5229 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5230 auto Mul1 =
B.buildFMul(
5234 auto Mul2 =
B.buildFMul(
5236 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5239 auto Mad2 =
B.buildFMAD(
5243 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5244 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5246 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5261 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5263 auto Zero64 =
B.buildConstant(
S64, 0);
5264 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5266 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5267 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5269 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5270 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5271 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5273 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5274 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5275 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5277 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5278 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5279 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5280 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5281 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5283 auto Zero32 =
B.buildConstant(
S32, 0);
5284 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5285 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5286 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5288 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5289 Register NumerLo = UnmergeNumer.getReg(0);
5290 Register NumerHi = UnmergeNumer.getReg(1);
5292 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5293 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5294 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5295 Register Mul3_Lo = UnmergeMul3.getReg(0);
5296 Register Mul3_Hi = UnmergeMul3.getReg(1);
5297 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5298 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5299 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5300 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5302 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5303 Register DenomLo = UnmergeDenom.getReg(0);
5304 Register DenomHi = UnmergeDenom.getReg(1);
5307 auto C1 =
B.buildSExt(
S32, CmpHi);
5310 auto C2 =
B.buildSExt(
S32, CmpLo);
5313 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5320 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5321 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5322 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5323 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5325 auto One64 =
B.buildConstant(
S64, 1);
5326 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5332 auto C6 =
B.buildSelect(
5336 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5337 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5339 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5340 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5341 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5347 auto Sel1 =
B.buildSelect(
5354 auto Sel2 =
B.buildSelect(
5365 switch (
MI.getOpcode()) {
5368 case AMDGPU::G_UDIV: {
5369 DstDivReg =
MI.getOperand(0).getReg();
5372 case AMDGPU::G_UREM: {
5373 DstRemReg =
MI.getOperand(0).getReg();
5376 case AMDGPU::G_UDIVREM: {
5377 DstDivReg =
MI.getOperand(0).getReg();
5378 DstRemReg =
MI.getOperand(1).getReg();
5385 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5386 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5387 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5397 MI.eraseFromParent();
5408 if (Ty !=
S32 && Ty !=
S64)
5411 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5412 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5413 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5415 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5416 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5417 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5419 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5420 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5422 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5423 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5425 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5426 switch (
MI.getOpcode()) {
5429 case AMDGPU::G_SDIV: {
5430 DstDivReg =
MI.getOperand(0).getReg();
5434 case AMDGPU::G_SREM: {
5435 DstRemReg =
MI.getOperand(0).getReg();
5439 case AMDGPU::G_SDIVREM: {
5440 DstDivReg =
MI.getOperand(0).getReg();
5441 DstRemReg =
MI.getOperand(1).getReg();
5454 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5455 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5456 B.buildSub(DstDivReg, SignXor, Sign);
5460 auto Sign = LHSign.getReg(0);
5461 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5462 B.buildSub(DstRemReg, SignXor, Sign);
5465 MI.eraseFromParent();
5481 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5492 if (CLHS->isExactlyValue(1.0)) {
5493 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5497 MI.eraseFromParent();
5502 if (CLHS->isExactlyValue(-1.0)) {
5503 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5504 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5505 .addUse(FNeg.getReg(0))
5508 MI.eraseFromParent();
5515 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5520 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5523 B.buildFMul(Res, LHS, RCP, Flags);
5525 MI.eraseFromParent();
5540 if (!AllowInaccurateRcp)
5548 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5550 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5551 auto One =
B.buildFConstant(ResTy, 1.0);
5553 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5557 R =
B.buildFNeg(ResTy, R);
5559 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5560 R =
B.buildFMA(ResTy, Tmp0, R, R);
5562 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5563 R =
B.buildFMA(ResTy, Tmp1, R, R);
5567 B.buildCopy(Res, R);
5568 MI.eraseFromParent();
5572 auto Ret =
B.buildFMul(ResTy,
X, R);
5573 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5575 B.buildFMA(Res, Tmp2, R, Ret);
5576 MI.eraseFromParent();
5608 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5609 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5610 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5611 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5612 .addUse(RHSExt.getReg(0))
5614 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5616 if (ST.hasMadMacF32Insts()) {
5617 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5618 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5619 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5621 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5622 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5623 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5625 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5626 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5627 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5628 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5629 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5630 .addUse(RDst.getReg(0))
5635 MI.eraseFromParent();
5648 unsigned SPDenormMode =
5651 if (ST.hasDenormModeInst()) {
5653 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5655 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5656 B.buildInstr(AMDGPU::S_DENORM_MODE)
5657 .addImm(NewDenormModeValue);
5660 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5661 .addImm(SPDenormMode)
5683 auto One =
B.buildFConstant(
S32, 1.0f);
5685 auto DenominatorScaled =
5686 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5691 auto NumeratorScaled =
5692 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5698 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5699 .addUse(DenominatorScaled.getReg(0))
5701 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5704 const bool HasDynamicDenormals =
5709 if (!PreservesDenormals) {
5710 if (HasDynamicDenormals) {
5712 B.buildInstr(AMDGPU::S_GETREG_B32)
5713 .addDef(SavedSPDenormMode)
5719 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5720 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5721 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5722 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5723 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5724 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5726 if (!PreservesDenormals) {
5727 if (HasDynamicDenormals) {
5728 assert(SavedSPDenormMode);
5729 B.buildInstr(AMDGPU::S_SETREG_B32)
5730 .addReg(SavedSPDenormMode)
5736 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5737 .addUse(Fma4.getReg(0))
5738 .addUse(Fma1.getReg(0))
5739 .addUse(Fma3.getReg(0))
5740 .addUse(NumeratorScaled.getReg(1))
5743 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5744 .addUse(Fmas.getReg(0))
5749 MI.eraseFromParent();
5768 auto One =
B.buildFConstant(
S64, 1.0);
5770 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5776 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5778 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5779 .addUse(DivScale0.getReg(0))
5782 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5783 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5784 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5786 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5792 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5793 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5794 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5797 if (!ST.hasUsableDivScaleConditionOutput()) {
5803 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5804 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5805 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5806 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5809 Scale1Unmerge.getReg(1));
5811 Scale0Unmerge.getReg(1));
5812 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5814 Scale = DivScale1.getReg(1);
5817 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5818 .addUse(Fma4.getReg(0))
5819 .addUse(Fma3.getReg(0))
5820 .addUse(
Mul.getReg(0))
5824 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5825 .addUse(Fmas.getReg(0))
5830 MI.eraseFromParent();
5845 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5848 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5852 if (ST.hasFractBug()) {
5853 auto Fabs =
B.buildFAbs(Ty, Val);
5857 auto Zero =
B.buildConstant(InstrExpTy, 0);
5858 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5859 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5862 B.buildCopy(Res0, Mant);
5863 B.buildSExtOrTrunc(Res1, Exp);
5865 MI.eraseFromParent();
5880 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5883 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5884 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5885 auto C2 =
B.buildFConstant(
S32, 1.0f);
5888 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5890 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5892 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5893 .addUse(Mul0.getReg(0))
5896 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5898 B.buildFMul(Res, Sel, Mul1, Flags);
5900 MI.eraseFromParent();
5909 unsigned Flags =
MI.getFlags();
5910 assert(!ST.has16BitInsts());
5912 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5913 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5914 .addUse(Ext.getReg(0))
5916 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5917 MI.eraseFromParent();
5927 const unsigned Flags =
MI.getFlags();
5936 MI.eraseFromParent();
5940 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5942 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5943 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5944 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5949 .addUse(SqrtX.getReg(0))
5952 auto NegOne =
B.buildConstant(I32, -1);
5953 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5955 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5956 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5958 auto PosOne =
B.buildConstant(I32, 1);
5959 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5961 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5962 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5964 auto Zero =
B.buildFConstant(
F32, 0.0f);
5968 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5972 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5975 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
5976 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5978 auto Half =
B.buildFConstant(
F32, 0.5f);
5979 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
5980 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
5981 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
5982 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
5983 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5984 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
5985 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
5986 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5989 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
5991 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
5993 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5996 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5998 MI.eraseFromParent();
6033 unsigned Flags =
MI.getFlags();
6038 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
6040 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
6044 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6045 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6046 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6049 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6051 auto Half =
B.buildFConstant(
F64, 0.5);
6052 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6053 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6055 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6056 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6058 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6059 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6061 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6062 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6064 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6066 Register SqrtRet = SqrtS2.getReg(0);
6068 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6069 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6070 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6073 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6074 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6075 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6080 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6089 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6091 MI.eraseFromParent();
6122 auto Flags =
MI.getFlags();
6134 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6144 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6145 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6150 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6152 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6153 MI.eraseFromParent();
6165 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6166 IID == Intrinsic::amdgcn_permlanex16;
6167 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6168 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6169 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6170 IID == Intrinsic::amdgcn_permlane_up ||
6171 IID == Intrinsic::amdgcn_permlane_down ||
6172 IID == Intrinsic::amdgcn_permlane_xor;
6176 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6178 case Intrinsic::amdgcn_readfirstlane:
6179 case Intrinsic::amdgcn_permlane64:
6180 return LaneOp.getReg(0);
6181 case Intrinsic::amdgcn_readlane:
6182 case Intrinsic::amdgcn_set_inactive:
6183 case Intrinsic::amdgcn_set_inactive_chain_arg:
6184 return LaneOp.addUse(Src1).getReg(0);
6185 case Intrinsic::amdgcn_writelane:
6186 case Intrinsic::amdgcn_permlane_bcast:
6187 case Intrinsic::amdgcn_permlane_up:
6188 case Intrinsic::amdgcn_permlane_down:
6189 case Intrinsic::amdgcn_permlane_xor:
6190 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6191 case Intrinsic::amdgcn_permlane16:
6192 case Intrinsic::amdgcn_permlanex16: {
6194 int64_t Src4 =
MI.getOperand(6).getImm();
6195 int64_t Src5 =
MI.getOperand(7).getImm();
6196 return LaneOp.addUse(Src1)
6203 case Intrinsic::amdgcn_mov_dpp8:
6204 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6205 case Intrinsic::amdgcn_update_dpp:
6206 return LaneOp.addUse(Src1)
6207 .addImm(
MI.getOperand(4).getImm())
6208 .addImm(
MI.getOperand(5).getImm())
6209 .addImm(
MI.getOperand(6).getImm())
6210 .addImm(
MI.getOperand(7).getImm())
6220 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6221 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6222 IsPermlaneShuffle) {
6223 Src1 =
MI.getOperand(3).getReg();
6224 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6225 IsPermlaneShuffle) {
6226 Src2 =
MI.getOperand(4).getReg();
6231 unsigned Size = Ty.getSizeInBits();
6233 unsigned SplitSize = 32;
6234 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6235 ST.hasDPALU_DPP() &&
6239 if (
Size == SplitSize) {
6245 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6247 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6250 if (IID == Intrinsic::amdgcn_writelane)
6253 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6254 B.buildTrunc(DstReg, LaneOpDst);
6255 MI.eraseFromParent();
6259 if (
Size % SplitSize != 0)
6263 bool NeedsBitcast =
false;
6264 if (Ty.isVector()) {
6267 if (EltSize == SplitSize) {
6268 PartialResTy = EltTy;
6269 }
else if (EltSize == 16 || EltSize == 32) {
6270 unsigned NElem = SplitSize / EltSize;
6274 NeedsBitcast =
true;
6279 unsigned NumParts =
Size / SplitSize;
6283 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6284 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6286 if (IID == Intrinsic::amdgcn_writelane)
6287 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6289 for (
unsigned i = 0; i < NumParts; ++i) {
6290 Src0 = Src0Parts.
getReg(i);
6292 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6293 Src1 = Src1Parts.
getReg(i);
6295 if (IID == Intrinsic::amdgcn_writelane)
6296 Src2 = Src2Parts.
getReg(i);
6298 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6302 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6305 B.buildMergeLikeInstr(DstReg, PartialRes);
6307 MI.eraseFromParent();
6315 ST.getTargetLowering()->getImplicitParameterOffset(
6325 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6326 B.buildConstant(IdxTy,
Offset).getReg(0));
6337 Register Pointer =
MI.getOperand(2).getReg();
6339 Register NumRecords =
MI.getOperand(4).getReg();
6345 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6347 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6349 if (ST.has45BitNumRecordsBufferResource()) {
6354 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6355 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6356 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6357 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6361 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6362 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6363 auto ExtShiftedStride =
6364 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6365 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6366 auto ExtShiftedFlags =
6367 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6368 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6370 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6371 B.buildMergeValues(Result, {LowHalf, HighHalf});
6373 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6374 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6375 auto LowHalf = Unmerge.getReg(0);
6376 auto HighHalf = Unmerge.getReg(1);
6378 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6379 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6380 auto ShiftConst =
B.buildConstant(
S32, 16);
6381 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6382 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6383 Register NewHighHalfReg = NewHighHalf.getReg(0);
6384 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6387 MI.eraseFromParent();
6404 MI.eraseFromParent();
6412 std::optional<uint32_t> KnownSize =
6414 if (KnownSize.has_value())
6415 B.buildConstant(DstReg, *KnownSize);
6433 MI.eraseFromParent();
6440 unsigned AddrSpace)
const {
6442 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6446 ST.hasGloballyAddressableScratch()) {
6448 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6449 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6451 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6453 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6455 B.buildConstant(
S32, 1u << 26));
6460 MI.eraseFromParent();
6470std::pair<Register, unsigned>
6482 bool CheckNUW = ST.hasGFX1250Insts();
6484 MRI, OrigOffset,
nullptr, CheckNUW);
6488 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6498 unsigned Overflow = ImmOffset & ~MaxImm;
6499 ImmOffset -= Overflow;
6500 if ((int32_t)Overflow < 0) {
6501 Overflow += ImmOffset;
6505 if (Overflow != 0) {
6507 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6509 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6510 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6515 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6517 return std::pair(BaseReg, ImmOffset);
6524 bool ImageStore)
const {
6530 if (ST.hasUnpackedD16VMem()) {
6531 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6534 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6535 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6543 if (ImageStore && ST.hasImageStoreD16Bug()) {
6546 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6548 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6555 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6556 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6558 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6566 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6567 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6569 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6586 bool IsFormat)
const {
6598 VData =
B.buildBitcast(Ty, VData).getReg(0);
6606 if (Ty.isVector()) {
6607 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6619 bool IsFormat)
const {
6626 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6641 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6644 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6648 VIndex =
MI.getOperand(3).getReg();
6651 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6654 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6655 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6659 Format =
MI.getOperand(5 + OpOffset).getImm();
6663 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6669 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6670 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6671 }
else if (IsFormat) {
6672 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6673 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6677 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6680 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6683 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6688 auto MIB =
B.buildInstr(
Opc)
6699 MIB.addImm(AuxiliaryData)
6700 .addImm(HasVIndex ? -1 : 0)
6701 .addMemOperand(MMO);
6703 MI.eraseFromParent();
6709 unsigned ImmOffset,
unsigned Format,
6712 auto MIB =
B.buildInstr(
Opc)
6723 MIB.addImm(AuxiliaryData)
6724 .addImm(HasVIndex ? -1 : 0)
6725 .addMemOperand(MMO);
6731 bool IsTyped)
const {
6745 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6746 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6748 StatusDst =
MI.getOperand(1).getReg();
6753 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6756 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6759 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6762 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6765 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6768 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6769 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6773 Format =
MI.getOperand(5 + OpOffset).getImm();
6777 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6787 Dst =
MI.getOperand(0).getReg();
6788 B.setInsertPt(
B.getMBB(),
MI);
6795 Dst =
MI.getOperand(0).getReg();
6796 B.setInsertPt(
B.getMBB(),
MI);
6800 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6801 const bool Unpacked = ST.hasUnpackedD16VMem();
6811 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6812 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6813 }
else if (IsFormat) {
6817 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6819 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6820 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6825 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6826 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6829 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6830 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6833 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6834 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6840 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6841 unsigned NumLoadDWords = NumValueDWords + 1;
6843 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6845 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6847 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6848 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6849 B.buildTrunc(Dst, ExtDst);
6850 }
else if (NumValueDWords == 1) {
6851 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6854 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6855 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6857 B.buildUnmerge(LoadElts, LoadDstReg);
6859 B.buildMergeLikeInstr(Dst, LoadElts);
6862 (IsD16 && !Ty.isVector())) {
6863 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6865 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6866 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6867 B.buildTrunc(Dst, LoadDstReg);
6868 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6870 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6872 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6873 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6875 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6877 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6878 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6879 B.buildMergeLikeInstr(Dst, Repack);
6882 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6885 MI.eraseFromParent();
6891 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6961 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6963 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6966 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6968 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6971 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6973 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6976 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6977 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6978 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6979 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6989 const bool IsCmpSwap =
6990 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6991 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6992 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6993 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7004 CmpVal =
MI.getOperand(3).getReg();
7009 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
7010 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7013 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
7016 VIndex =
MI.getOperand(4 + OpOffset).getReg();
7019 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
7022 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
7023 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
7024 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
7043 .addImm(AuxiliaryData)
7044 .addImm(HasVIndex ? -1 : 0)
7045 .addMemOperand(MMO);
7047 MI.eraseFromParent();
7057 bool IsA16,
bool IsG16) {
7073 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7078 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7082 "Bias needs to be converted to 16 bit in A16 mode");
7084 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7090 if (((
I + 1) >= EndIdx) ||
7097 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7099 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7104 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7115 int DimIdx,
int NumVAddrs) {
7119 for (
int I = 0;
I != NumVAddrs; ++
I) {
7121 if (
SrcOp.isReg()) {
7127 int NumAddrRegs = AddrRegs.
size();
7128 if (NumAddrRegs != 1) {
7131 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7134 for (
int I = 1;
I != NumVAddrs; ++
I) {
7137 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7159 const unsigned NumDefs =
MI.getNumExplicitDefs();
7160 const unsigned ArgOffset = NumDefs + 1;
7161 bool IsTFE = NumDefs == 2;
7179 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7183 const bool IsAtomicPacked16Bit =
7184 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7185 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7193 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7194 const bool IsA16 = AddrTy ==
S16;
7195 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7198 if (!BaseOpcode->
Atomic) {
7199 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7202 }
else if (DMask != 0) {
7204 }
else if (!IsTFE && !BaseOpcode->
Store) {
7206 B.buildUndef(
MI.getOperand(0));
7207 MI.eraseFromParent();
7215 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7216 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7217 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7218 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7219 unsigned NewOpcode = LoadOpcode;
7220 if (BaseOpcode->
Store)
7221 NewOpcode = StoreOpcode;
7223 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7226 MI.setDesc(
B.getTII().get(NewOpcode));
7230 if (IsTFE && DMask == 0) {
7233 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7236 if (BaseOpcode->
Atomic) {
7241 if (Ty.isVector() && !IsAtomicPacked16Bit)
7248 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7249 MI.getOperand(2).setReg(
Concat.getReg(0));
7250 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7254 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7257 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7263 if (IsA16 && !ST.hasA16()) {
7268 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7269 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7271 if (IsA16 || IsG16) {
7279 const bool UseNSA = ST.hasNSAEncoding() &&
7280 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7281 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7282 const bool UsePartialNSA =
7283 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7285 if (UsePartialNSA) {
7289 auto Concat =
B.buildConcatVectors(
7290 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7291 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7292 PackedRegs.
resize(NSAMaxSize);
7293 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7295 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7296 PackedRegs[0] =
Concat.getReg(0);
7300 const unsigned NumPacked = PackedRegs.
size();
7303 if (!
SrcOp.isReg()) {
7313 SrcOp.setReg(AMDGPU::NoRegister);
7330 const bool UseNSA = ST.hasNSAEncoding() &&
7331 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7332 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7333 const bool UsePartialNSA =
7334 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7336 if (UsePartialNSA) {
7338 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7340 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7355 if (!Ty.isVector() || !IsD16)
7359 if (RepackedReg != VData) {
7360 MI.getOperand(1).setReg(RepackedReg);
7368 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7371 if (NumElts < DMaskLanes)
7374 if (NumElts > 4 || DMaskLanes > 4)
7384 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7385 const LLT AdjustedTy =
7401 if (IsD16 && ST.hasUnpackedD16VMem()) {
7408 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7409 unsigned RoundedSize = 32 * RoundedElts;
7413 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7418 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7424 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7428 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7429 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7433 MI.getOperand(0).setReg(NewResultReg);
7441 Dst1Reg =
MI.getOperand(1).getReg();
7446 MI.removeOperand(1);
7450 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7459 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7461 if (ResultNumRegs == 1) {
7463 ResultRegs[0] = NewResultReg;
7466 for (
int I = 0;
I != NumDataRegs; ++
I)
7468 B.buildUnmerge(ResultRegs, NewResultReg);
7473 ResultRegs.
resize(NumDataRegs);
7478 if (IsD16 && !Ty.isVector()) {
7479 B.buildTrunc(DstReg, ResultRegs[0]);
7484 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7485 B.buildBitcast(DstReg, ResultRegs[0]);
7497 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7499 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7500 }
else if (ST.hasUnpackedD16VMem()) {
7502 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7506 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7510 for (
int I = 0;
I != NumElts; ++
I)
7517 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7518 B.buildBuildVector(DstReg, ResultRegs);
7522 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7523 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7529 if (ResultRegs.
size() == 1) {
7530 NewResultReg = ResultRegs[0];
7531 }
else if (ResultRegs.
size() == 2) {
7533 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7541 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7543 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7548 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7549 B.buildConcatVectors(DstReg, ResultRegs);
7558 Register OrigDst =
MI.getOperand(0).getReg();
7560 LLT Ty =
B.getMRI()->getType(OrigDst);
7561 unsigned Size = Ty.getSizeInBits();
7564 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7566 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7567 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7570 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7572 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7581 B.setInsertPt(
B.getMBB(),
MI);
7586 B.setInsertPt(
B.getMBB(),
MI);
7592 MI.setDesc(
B.getTII().get(
Opc));
7593 MI.removeOperand(1);
7596 const unsigned MemSize = (
Size + 7) / 8;
7597 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7604 MI.addMemOperand(MF, MMO);
7605 if (Dst != OrigDst) {
7606 MI.getOperand(0).setReg(Dst);
7607 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7608 B.buildTrunc(OrigDst, Dst);
7630 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7631 MI.removeOperand(0);
7641 if (!ST.hasTrapHandler() ||
7645 return ST.supportsGetDoorbellID() ?
7658 MI.eraseFromParent();
7668 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7670 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7674 MI.eraseFromParent();
7683 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7690 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7710 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7713 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7714 B.buildCopy(SGPR01, Temp);
7715 B.buildInstr(AMDGPU::S_TRAP)
7718 MI.eraseFromParent();
7729 B.buildCopy(SGPR01, LiveIn);
7730 B.buildInstr(AMDGPU::S_TRAP)
7734 MI.eraseFromParent();
7743 if (ST.hasPrivEnabledTrap2NopBug()) {
7744 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7746 MI.eraseFromParent();
7750 B.buildInstr(AMDGPU::S_TRAP)
7752 MI.eraseFromParent();
7761 if (!ST.hasTrapHandler() ||
7765 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7768 B.buildInstr(AMDGPU::S_TRAP)
7772 MI.eraseFromParent();
7785 Register NodePtr =
MI.getOperand(2).getReg();
7786 Register RayExtent =
MI.getOperand(3).getReg();
7787 Register RayOrigin =
MI.getOperand(4).getReg();
7789 Register RayInvDir =
MI.getOperand(6).getReg();
7792 if (!ST.hasGFX10_AEncoding()) {
7795 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7804 const unsigned NumVDataDwords = 4;
7805 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7806 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7808 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7810 const unsigned BaseOpcodes[2][2] = {
7811 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7812 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7813 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7817 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7818 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7819 : AMDGPU::MIMGEncGfx10NSA,
7820 NumVDataDwords, NumVAddrDwords);
7824 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7825 : AMDGPU::MIMGEncGfx10Default,
7826 NumVDataDwords, NumVAddrDwords);
7831 if (UseNSA && IsGFX11Plus) {
7833 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7834 auto Merged =
B.buildMergeLikeInstr(
7835 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7836 Ops.push_back(Merged.getReg(0));
7839 Ops.push_back(NodePtr);
7840 Ops.push_back(RayExtent);
7841 packLanes(RayOrigin);
7844 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7845 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7846 auto MergedDir =
B.buildMergeLikeInstr(
7849 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7850 UnmergeRayDir.getReg(0)}))
7853 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7854 UnmergeRayDir.getReg(1)}))
7857 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7858 UnmergeRayDir.getReg(2)}))
7860 Ops.push_back(MergedDir.getReg(0));
7863 packLanes(RayInvDir);
7867 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7868 Ops.push_back(Unmerge.getReg(0));
7869 Ops.push_back(Unmerge.getReg(1));
7871 Ops.push_back(NodePtr);
7873 Ops.push_back(RayExtent);
7876 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7877 Ops.push_back(Unmerge.getReg(0));
7878 Ops.push_back(Unmerge.getReg(1));
7879 Ops.push_back(Unmerge.getReg(2));
7882 packLanes(RayOrigin);
7884 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7885 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7889 B.buildMergeLikeInstr(R1,
7890 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7891 B.buildMergeLikeInstr(
7892 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7893 B.buildMergeLikeInstr(
7894 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7900 packLanes(RayInvDir);
7907 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7909 Ops.push_back(MergedOps);
7912 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7921 .addImm(IsA16 ? 1 : 0)
7924 MI.eraseFromParent();
7934 Register DstOrigin =
MI.getOperand(1).getReg();
7936 Register NodePtr =
MI.getOperand(4).getReg();
7937 Register RayExtent =
MI.getOperand(5).getReg();
7938 Register InstanceMask =
MI.getOperand(6).getReg();
7939 Register RayOrigin =
MI.getOperand(7).getReg();
7941 Register Offsets =
MI.getOperand(9).getReg();
7942 Register TDescr =
MI.getOperand(10).getReg();
7944 if (!ST.hasBVHDualAndBVH8Insts()) {
7947 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7952 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7953 const unsigned NumVDataDwords = 10;
7954 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7956 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7957 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7958 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7961 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7962 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7964 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7965 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7971 .addUse(RayExtentInstanceMaskVec.getReg(0))
7978 MI.eraseFromParent();
7987 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7988 MI.eraseFromParent();
7995 if (!ST.hasArchitectedSGPRs())
7999 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
8000 auto LSB =
B.buildConstant(
S32, 25);
8001 auto Width =
B.buildConstant(
S32, 5);
8002 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8003 MI.eraseFromParent();
8011 unsigned Width)
const {
8015 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8016 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8019 MI.eraseFromParent();
8037 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8041 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8044 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8045 MI.eraseFromParent();
8056 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8060 .addReg(Unmerge.getReg(0));
8064 .addReg(Unmerge.getReg(1));
8065 MI.eraseFromParent();
8077 case Intrinsic::amdgcn_icmp: {
8088 if (!Src1Const || Src1Const->Value != 0)
8092 int64_t Pred =
MI.getOperand(4).getImm();
8098 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8099 MI.eraseFromParent();
8102 case Intrinsic::sponentry:
8108 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8111 B.buildIntToPtr(DstReg, TmpReg);
8112 MI.eraseFromParent();
8114 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8116 B.buildFrameIndex(
MI.getOperand(0), FI);
8117 MI.eraseFromParent();
8120 case Intrinsic::amdgcn_if:
8121 case Intrinsic::amdgcn_else: {
8124 bool Negated =
false;
8136 std::swap(CondBrTarget, UncondBrTarget);
8138 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8139 if (IntrID == Intrinsic::amdgcn_if) {
8140 B.buildInstr(AMDGPU::SI_IF)
8143 .addMBB(UncondBrTarget);
8145 B.buildInstr(AMDGPU::SI_ELSE)
8148 .addMBB(UncondBrTarget);
8157 B.buildBr(*CondBrTarget);
8162 MI.eraseFromParent();
8163 BrCond->eraseFromParent();
8169 case Intrinsic::amdgcn_loop: {
8172 bool Negated =
false;
8182 std::swap(CondBrTarget, UncondBrTarget);
8184 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8185 B.buildInstr(AMDGPU::SI_LOOP)
8187 .addMBB(UncondBrTarget);
8192 B.buildBr(*CondBrTarget);
8194 MI.eraseFromParent();
8195 BrCond->eraseFromParent();
8202 case Intrinsic::amdgcn_wave_reduce_min:
8203 case Intrinsic::amdgcn_wave_reduce_umin:
8204 case Intrinsic::amdgcn_wave_reduce_max:
8205 case Intrinsic::amdgcn_wave_reduce_umax:
8206 case Intrinsic::amdgcn_wave_reduce_add:
8207 case Intrinsic::amdgcn_wave_reduce_sub:
8208 case Intrinsic::amdgcn_wave_reduce_and:
8209 case Intrinsic::amdgcn_wave_reduce_or:
8210 case Intrinsic::amdgcn_wave_reduce_xor: {
8215 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8216 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8217 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8218 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8219 auto Ext = NeedsSignExt ?
B.buildSExt(
LLT::scalar(32), SrcReg)
8224 .addUse(Ext.getReg(0))
8225 .addImm(
MI.getOperand(3).getImm());
8226 B.buildTrunc(DstReg, NewDst);
8227 MI.eraseFromParent();
8230 case Intrinsic::amdgcn_addrspacecast_nonnull:
8232 case Intrinsic::amdgcn_make_buffer_rsrc:
8234 case Intrinsic::amdgcn_kernarg_segment_ptr:
8237 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8238 MI.eraseFromParent();
8244 case Intrinsic::amdgcn_implicitarg_ptr:
8246 case Intrinsic::amdgcn_workitem_id_x:
8249 case Intrinsic::amdgcn_workitem_id_y:
8252 case Intrinsic::amdgcn_workitem_id_z:
8255 case Intrinsic::amdgcn_workgroup_id_x:
8260 case Intrinsic::amdgcn_workgroup_id_y:
8265 case Intrinsic::amdgcn_workgroup_id_z:
8270 case Intrinsic::amdgcn_cluster_id_x:
8271 return ST.hasClusters() &&
8274 case Intrinsic::amdgcn_cluster_id_y:
8275 return ST.hasClusters() &&
8278 case Intrinsic::amdgcn_cluster_id_z:
8279 return ST.hasClusters() &&
8282 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8283 return ST.hasClusters() &&
8286 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8287 return ST.hasClusters() &&
8290 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8291 return ST.hasClusters() &&
8294 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8295 return ST.hasClusters() &&
8297 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8298 return ST.hasClusters() &&
8301 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8302 return ST.hasClusters() &&
8305 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8306 return ST.hasClusters() &&
8309 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8310 return ST.hasClusters() &&
8314 case Intrinsic::amdgcn_wave_id:
8316 case Intrinsic::amdgcn_lds_kernel_id:
8319 case Intrinsic::amdgcn_dispatch_ptr:
8322 case Intrinsic::amdgcn_queue_ptr:
8325 case Intrinsic::amdgcn_implicit_buffer_ptr:
8328 case Intrinsic::amdgcn_dispatch_id:
8331 case Intrinsic::r600_read_ngroups_x:
8335 case Intrinsic::r600_read_ngroups_y:
8338 case Intrinsic::r600_read_ngroups_z:
8341 case Intrinsic::r600_read_local_size_x:
8344 case Intrinsic::r600_read_local_size_y:
8348 case Intrinsic::r600_read_local_size_z:
8351 case Intrinsic::amdgcn_fdiv_fast:
8353 case Intrinsic::amdgcn_is_shared:
8355 case Intrinsic::amdgcn_is_private:
8357 case Intrinsic::amdgcn_wavefrontsize: {
8358 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8359 MI.eraseFromParent();
8362 case Intrinsic::amdgcn_s_buffer_load:
8364 case Intrinsic::amdgcn_raw_buffer_store:
8365 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8366 case Intrinsic::amdgcn_struct_buffer_store:
8367 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8369 case Intrinsic::amdgcn_raw_buffer_store_format:
8370 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8371 case Intrinsic::amdgcn_struct_buffer_store_format:
8372 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8374 case Intrinsic::amdgcn_raw_tbuffer_store:
8375 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8376 case Intrinsic::amdgcn_struct_tbuffer_store:
8377 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8379 case Intrinsic::amdgcn_raw_buffer_load:
8380 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8381 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8382 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8383 case Intrinsic::amdgcn_struct_buffer_load:
8384 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8385 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8386 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8388 case Intrinsic::amdgcn_raw_buffer_load_format:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8390 case Intrinsic::amdgcn_struct_buffer_load_format:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8393 case Intrinsic::amdgcn_raw_tbuffer_load:
8394 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8395 case Intrinsic::amdgcn_struct_tbuffer_load:
8396 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8398 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8399 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8400 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8401 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8402 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8403 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8404 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8405 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8406 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8407 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8408 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8409 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8410 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8411 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8412 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8413 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8414 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8415 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8416 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8417 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8418 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8419 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8420 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8421 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8422 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8423 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8424 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8425 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8426 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8428 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8429 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8430 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8431 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8432 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8433 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8434 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8435 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8436 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8437 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8438 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8439 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8440 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8441 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8442 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8444 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8445 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8446 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8447 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8448 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8449 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8450 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8451 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8452 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8453 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8454 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8456 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8457 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8458 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8460 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8461 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8462 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8463 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8464 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8465 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8466 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8468 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8469 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8471 case Intrinsic::amdgcn_rsq_clamp:
8473 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8475 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8476 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8478 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8479 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8480 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8481 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8482 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8483 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8484 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8485 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8489 if (IndexArgTy !=
S64) {
8490 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8491 :
B.buildAnyExt(
S64, Index);
8492 MI.getOperand(5).setReg(NewIndex.getReg(0));
8496 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8497 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8498 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8499 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8500 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8501 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8502 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8503 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8507 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8510 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8511 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8512 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8513 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8514 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8515 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8516 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8517 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8518 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8520 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8524 if (IndexArgTy != IdxTy) {
8525 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8526 :
B.buildAnyExt(IdxTy, Index);
8527 MI.getOperand(7).setReg(NewIndex.getReg(0));
8532 case Intrinsic::amdgcn_fmed3: {
8538 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8539 MI.removeOperand(1);
8543 case Intrinsic::amdgcn_readlane:
8544 case Intrinsic::amdgcn_writelane:
8545 case Intrinsic::amdgcn_readfirstlane:
8546 case Intrinsic::amdgcn_permlane16:
8547 case Intrinsic::amdgcn_permlanex16:
8548 case Intrinsic::amdgcn_permlane64:
8549 case Intrinsic::amdgcn_set_inactive:
8550 case Intrinsic::amdgcn_set_inactive_chain_arg:
8551 case Intrinsic::amdgcn_mov_dpp8:
8552 case Intrinsic::amdgcn_update_dpp:
8553 case Intrinsic::amdgcn_permlane_bcast:
8554 case Intrinsic::amdgcn_permlane_up:
8555 case Intrinsic::amdgcn_permlane_down:
8556 case Intrinsic::amdgcn_permlane_xor:
8558 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8560 case Intrinsic::amdgcn_dead: {
8564 MI.eraseFromParent();
8567 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8568 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8569 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8570 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8571 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8572 MI.eraseFromParent();
8574 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8575 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8576 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8577 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8578 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8579 MI.eraseFromParent();
8581 case Intrinsic::amdgcn_av_load_b128:
8582 case Intrinsic::amdgcn_av_store_b128: {
8584 if (!ST.hasFlatGlobalInsts()) {
8585 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8586 ?
"llvm.amdgcn.av.load.b128"
8587 :
"llvm.amdgcn.av.store.b128";
8590 Fn,
Twine(Name) +
" not supported on subtarget",
MI.getDebugLoc()));
8593 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8594 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8595 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8597 B.buildStore(
MI.getOperand(2),
MI.getOperand(1),
8598 **
MI.memoperands_begin());
8599 MI.eraseFromParent();
8602 case Intrinsic::amdgcn_flat_load_monitor_b32:
8603 case Intrinsic::amdgcn_flat_load_monitor_b64:
8604 case Intrinsic::amdgcn_flat_load_monitor_b128:
8605 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8606 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8607 .add(
MI.getOperand(0))
8608 .add(
MI.getOperand(2))
8609 .addMemOperand(*
MI.memoperands_begin());
8610 MI.eraseFromParent();
8612 case Intrinsic::amdgcn_global_load_monitor_b32:
8613 case Intrinsic::amdgcn_global_load_monitor_b64:
8614 case Intrinsic::amdgcn_global_load_monitor_b128:
8615 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8616 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8617 .add(
MI.getOperand(0))
8618 .add(
MI.getOperand(2))
8619 .addMemOperand(*
MI.memoperands_begin());
8620 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.