#include "llvm/IR/IntrinsicsAMDGPU.h"
// ...
#include "AMDGPUGenCallingConv.inc"
// ...
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
// ...
  if (StoreSize % 32 == 0)
// ...
                 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
// ...
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);
// ...
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
                      MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
// ...
                     {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
// ...
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
// ...
                     {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
                      MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
                      MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
                      MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
                      MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
                      MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
                      MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
// ...
  const MVT ScalarIntVTs[] = {MVT::i32, MVT::i64};
  for (MVT VT : ScalarIntVTs) {
// ...
  for (auto VT : {MVT::i8, MVT::i16})
// ...
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
  for (MVT VT : VectorIntTypes) {
// ...
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
  for (MVT VT : FloatVectorTypes) {
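    // Editor's note: each of these loops appears to walk the scalar or vector
    // type list declared just above and register the per-type operation
    // actions for it; the individual setOperationAction calls are omitted
    // from this excerpt.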
// ...
  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
// ...
  unsigned Opc = N->getOpcode();
// ...
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
// ...
  return N->getValueType(0) == MVT::f32;
// ...
  if (isa<MemSDNode>(N))
// ...
  switch (N->getOpcode()) {
// ...
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
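      // Editor's note: this inner switch keys on the intrinsic ID of an
      // INTRINSIC_WO_CHAIN node; the interp intrinsics listed above are
      // likely the ones whose operands cannot take floating-point source
      // modifiers, with the case bodies omitted from this excerpt.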
// ...
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
// ...
  for (const SDNode *U : N->users()) {
// ...
                                      bool ForCodeSize) const {
// ...
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
// ...
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
// ...
  EVT OldVT = N->getValueType(0);
// ...
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
// ...
  return (OldSize < 32);
// ...
  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
// ...
                                     CastTy, MMO, &Fast) &&
// ...
  switch (N->getOpcode()) {
// ...
    unsigned IntrID = N->getConstantOperandVal(0);
// ...
    unsigned IntrID = N->getConstantOperandVal(1);
// ...
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
// ...
  switch (Op.getOpcode()) {
// ...
  EVT VT = Op.getValueType();
// ...
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
// ...
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
// ...
  unsigned SrcSize = Source.getSizeInBits();
// ...
  return DestSize < SrcSize && DestSize % 32 == 0;
// ...
  unsigned SrcSize = Source->getScalarSizeInBits();
// ...
  return SrcSize >= 32;
// ...
  return DestSize < SrcSize && DestSize % 32 == 0;
// ...
  unsigned SrcSize = Src->getScalarSizeInBits();
// ...
  return DestSize >= 32;
// ...
  return SrcSize == 32 && DestSize == 64;
// ...
  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;
// ...
  return Src == MVT::i32 && Dest == MVT::i64;
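// Editor's note: these predicates read like the isTruncateFree / isZExtFree
// style hooks: truncating to a smaller type whose size is still a multiple of
// 32 bits only drops whole registers, and i16->i32/i64 or i32->i64
// zero-extension is free because 16-bit values already occupy 32-bit
// registers and the upper 32 bits live in a separate register.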
// ...
  switch (N->getOpcode()) {
// ...
  if (!N->isDivergent() && DestVT.isInteger() &&
// ...
  if (isa<LoadSDNode>(N))
// ...
         "Expected shift op");
// ...
  SDValue ShiftLHS = N->getOperand(0);
// ...
  if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
// ...
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
// ...
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
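  // Editor's note: the lambda used above appears to recognize an
  // "(shl (zextload), width) op (zextload)" pair; returning false here
  // presumably keeps the shift from being commuted so the pair can still be
  // combined into a single wider load, though the enclosing hook is not shown.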
// ...
    return CC_AMDGPU_CS_CHAIN;
// ...
  return CC_AMDGPU_Func;
// ...
    return RetCC_SI_Shader;
// ...
    return RetCC_SI_Gfx;
// ...
  return RetCC_AMDGPU_Func;
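// Editor's note: these returns select among the CCAssignFn tables generated
// into AMDGPUGenCallingConv.inc: chain calls, graphics shaders, and ordinary
// functions each use their own argument and return-value conventions.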
// ...
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
// ...
  unsigned InIndex = 0;
// ...
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
// ...
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
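    // Editor's note: each kernel argument is placed at the running offset
    // rounded up to its alignment, and the running offset then advances by
    // the argument's alloc size; ExplicitOffset only adds the target's fixed
    // kernarg header to the final byte offset. Illustrative example (not from
    // the source): an i32 followed by a 16-byte-aligned <4 x float> would
    // land at relative offsets 0 and 16, leaving ExplicitArgOffset at 32.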
// ...
    for (unsigned Value = 0, NumValues = ValueVTs.size();
// ...
      } else if (RegisterVT.isVector()) {
// ...
        assert(MemoryBits % NumElements == 0);
// ...
                                          MemoryBits / NumElements);
// ...
      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
// ...
                                       BasePartOffset + PartOffset,
// ...
                                                   int ClobberedFI) const {
// ...
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
// ...
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
// ...
      if (FI->getIndex() < 0) {
// ...
        int64_t InLastByte = InFirstByte;
// ...
        if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
            (FirstByte <= InFirstByte && InFirstByte <= LastByte))
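          // Editor's note: a standard closed-interval overlap test between
          // the clobbered frame object [FirstByte, LastByte] and the load's
          // byte range [InFirstByte, InLastByte]; what happens to overlapping
          // loads is outside this excerpt.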
// ...
    FuncName = G->getSymbol();
// ...
    FuncName = G->getGlobal()->getName();
// ...
  switch (Op.getOpcode()) {
// ...
                     "instruction is not implemented yet!");
// ...
  switch (N->getOpcode()) {
// ...
  if (std::optional<uint32_t> Address =
// ...
      GV->getName() != "llvm.amdgcn.module.lds" &&
// ...
          Fn, "local memory global used by non-kernel function",
// ...
         "Do not know what to do with a non-zero offset");
// ...
  EVT VT = Op.getValueType();
// ...
  unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
  if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
    unsigned NewNumElt = OpBitSize / 32;
    EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
// ...
                                                     MVT::i32, NewNumElt);
    for (const SDUse &U : Op->ops()) {
// ...
      Args.push_back(NewIn);
// ...
  for (const SDUse &U : Op->ops())
// ...
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();
// ...
  assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
// ...
  EVT NewVT = NumElt == 2
// ...
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
// ...
  if (LHS == NegTrue && CFalse && CRHS) {
// ...
std::pair<SDValue, SDValue>
// ...
  return std::pair(Lo, Hi);
// ...
  HiVT = NumElts - LoNumElts == 1
// ...
  return std::pair(LoVT, HiVT);
// ...
std::pair<SDValue, SDValue>
// ...
                                  const EVT &LoVT, const EVT &HiVT,
// ...
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
// ...
  return std::pair(Lo, Hi);
// ...
  EVT VT = Op.getValueType();
// ...
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
// ...
  EVT LoMemVT, HiMemVT;
// ...
  Align BaseAlign = Load->getAlign();
// ...
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
// ...
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
// ...
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
// ...
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
// ...
  Align BaseAlign = Load->getAlign();
// ...
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
// ...
  assert(NumElements == 3);
// ...
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
// ...
  SDValue Val = Store->getValue();
// ...
  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
// ...
  EVT LoMemVT, HiMemVT;
// ...
  Align BaseAlign = Store->getAlign();
// ...
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
// ...
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
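// Editor's note: the load and store splitting above follow the same recipe:
// split the value type and memory type into Lo/Hi halves, emit the low half
// at the original base pointer and alignment, and emit the high half at the
// base plus the low half's store size with a correspondingly reduced
// alignment.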
// ...
  EVT VT = Op.getValueType();
// ...
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;
// ...
  if (LHSSignBits < 9)
// ...
  if (RHSSignBits < 9)
// ...
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
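  // Editor's note: this looks like the guard for the f32-based 24-bit
  // division path: requiring at least 9 redundant sign bits on each operand
  // leaves at most 23-24 significant bits, which an f32 mantissa can
  // represent exactly.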
// ...
  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
// ...
  EVT VT = Op.getValueType();
// ...
  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
// ...
  std::tie(Mulhi1_Lo, Mulhi1_Hi) =
// ...
  std::tie(Mulhi2_Lo, Mulhi2_Hi) =
// ...
  std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
// ...
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
// ...
  EVT VT = Op.getValueType();
// ...
  if (VT == MVT::i64) {
// ...
  if (VT == MVT::i32) {
// ...
  EVT VT = Op.getValueType();
// ...
  if (VT == MVT::i32) {
// ...
  if (VT == MVT::i64 &&
// ...
  EVT VT = Op.getValueType();
// ...
  auto Flags = Op->getFlags();
// ...
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
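  // Editor's note: 52 fraction bits and 11 exponent bits are the IEEE-754
  // binary64 field widths; the f64 lowerings below mask and shift the raw bit
  // pattern using these constants.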
// ...
  assert(Op.getValueType() == MVT::f64);
// ...
  const unsigned FractBits = 52;
// ...
      = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
// ...
  assert(Op.getValueType() == MVT::f64);
// ...
  auto VT = Op.getValueType();
  auto Arg = Op.getOperand(0u);
// ...
  EVT VT = Op.getValueType();
// ...
  switch (Src.getOpcode()) {
// ...
    return Src.getOperand(0).getValueType() == MVT::f16;
// ...
    unsigned IntrinsicID = Src.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_frexp_mant:
// ...
  if (Flags.hasApproximateFuncs())
// ...
  EVT VT = Src.getValueType();
// ...
  return IsLtSmallestNormal;
// ...
  EVT VT = Src.getValueType();
// ...
std::pair<SDValue, SDValue>
// ...
  return {ScaledInput, IsLtSmallestNormal};
// ...
  EVT VT = Op.getValueType();
// ...
  if (VT == MVT::f16) {
// ...
  auto [ScaledInput, IsLtSmallestNormal] =
// ...
  EVT VT = Op.getValueType();
// ...
  if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
// ...
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;
// ...
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;
// ...
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;
// ...
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;
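    // Editor's note: these are two-float splittings of the factors that turn
    // a hardware log2 into log/log10: c_log + cc_log ~= ln(2) ~= 0.6931472
    // and c_log10 + cc_log10 ~= log10(2) ~= 0.30103. The ch/ct pairs keep
    // fewer bits in the high word and are presumably used on the path without
    // a fused multiply-add.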
// ...
  const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
                            (Flags.hasNoInfs() || Options.NoInfsFPMath);
// ...
  if (!IsFiniteOnly) {
// ...
  EVT VT = Src.getValueType();
// ...
  double Log2BaseInverted =
// ...
  if (VT == MVT::f32) {
// ...
                                 ScaledResultOffset, Zero, Flags);
// ...
  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
// ...
  EVT VT = Op.getValueType();
// ...
  if (VT == MVT::f16) {
// ...
  EVT VT = X.getValueType();
// ...
                     SL, VT, Mul, Flags);
// ...
  const EVT VT = X.getValueType();
// ...
  EVT VT = Op.getValueType();
// ...
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;
// ...
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;
// ...
    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;
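    // Editor's note: hi/lo splittings of the factors that reduce exp/exp10 to
    // a hardware exp2: ch_exp + cl_exp ~= log2(e) ~= 1.442695 and
    // ch_exp10 + cl_exp10 (likewise c_exp10 + cc_exp10) ~= log2(10)
    // ~= 3.321928, so exp(x) becomes exp2(x * log2(e)) plus a low-order
    // correction term.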
// ...
  PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
// ...
      DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
// ...
  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
// ...
        DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
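  // Editor's note: the selected constants read as f32 underflow/overflow
  // cutoffs: -0x1.9d1da0p+6 ~= -103.28 ~= ln(2^-149), the smallest denormal,
  // and 0x1.62e430p+6 ~= 88.72 ~= ln(FLT_MAX); the exp10 pair (~ -44.85 and
  // ~ 38.53) are the base-10 equivalents. Inputs beyond them are presumably
  // flushed to 0 or +infinity.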
// ...
  auto Opc = Op.getOpcode();
  auto Arg = Op.getOperand(0u);
  auto ResultVT = Op.getValueType();
// ...
  if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
// ...
  assert(ResultVT == Arg.getValueType());
// ...
  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
// ...
  NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
// ...
  NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
// ...
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
// ...
  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
// ...
                      Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
// ...
  OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
// ...
  OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
// ...
  if (Subtarget->isGCN())
// ...
  EVT DestVT = Op.getValueType();
// ...
  EVT SrcVT = Src.getValueType();
// ...
  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
// ...
    if (DestVT == MVT::bf16) {
// ...
  if (SrcVT != MVT::i64)
// ...
  if (DestVT == MVT::f32)
// ...
  assert(DestVT == MVT::f64);
// ...
  EVT DestVT = Op.getValueType();
// ...
  EVT SrcVT = Src.getValueType();
// ...
  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
// ...
    if (DestVT == MVT::bf16) {
// ...
  if (SrcVT != MVT::i64)
// ...
  if (DestVT == MVT::f32)
// ...
  assert(DestVT == MVT::f64);
// ...
  EVT SrcVT = Src.getValueType();
// ...
  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
// ...
  if (Signed && SrcVT == MVT::f32) {
// ...
  if (SrcVT == MVT::f64) {
// ...
        llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)), SL,
// ...
        llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)), SL,
// ...
        llvm::bit_cast<float>(UINT32_C(0x2f800000)), SL, SrcVT);
// ...
        llvm::bit_cast<float>(UINT32_C(0xcf800000)), SL, SrcVT);
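    // Editor's note: the bit patterns decode to 2^-32 (0x3df0... as double,
    // 0x2f800000 as float) and -2^32 (0xc1f0... / 0xcf800000): the usual
    // fp-to-64-bit-int recipe scales by 2^-32 and floors to get the high 32
    // bits, then an fma with -2^32 recovers the low 32 bits.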
// ...
                             SL, MVT::i32, FloorMul);
// ...
  if (Signed && SrcVT == MVT::f32) {
// ...
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
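  // Editor's note: constants for a manual f64 -> f16 conversion: the 11-bit
  // double exponent mask (0x7ff) and the exponent biases of binary64 (1023)
  // and binary16 (15).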
// ...
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();
// ...
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
// ...
  if (SrcVT == MVT::bf16) {
// ...
    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
// ...
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
// ...
  if (DestVT != MVT::i64)
// ...
  if (SrcVT == MVT::f16 ||
// ...
    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
// ...
  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
// ...
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
// ...
  for (unsigned I = 0; I < NElts; ++I)
// ...
  EVT VT = Op.getValueType();
// ...
  unsigned NewOpcode = Node24->getOpcode();
// ...
  case Intrinsic::amdgcn_mul_i24:
// ...
  case Intrinsic::amdgcn_mul_u24:
// ...
  case Intrinsic::amdgcn_mulhi_i24:
// ...
  case Intrinsic::amdgcn_mulhi_u24:
// ...
  if (DemandedLHS || DemandedRHS)
// ...
                           DemandedLHS ? DemandedLHS : LHS,
                           DemandedRHS ? DemandedRHS : RHS);
// ...
template <typename IntTy>
// ...
  if (Width + Offset < 32) {
// ...
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    if constexpr (std::is_signed_v<IntTy>) {
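      // Editor's note: this helper constant-folds a bitfield extract on a
      // 32-bit value: the field is shifted up against bit 31 (in the omitted
      // line computing Shl) and then shifted back down by (32 - Width); when
      // IntTy is signed the right shift is arithmetic, sign-extending the
      // extracted field.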
// ...
    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
      if (M->isVolatile())