#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "AMDGPUGenCallingConv.inc"
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
  if (StoreSize % 32 == 0)
      {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      {MVT::f16, MVT::f32, MVT::f64}, Expand);
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
       MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
      {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
       MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
       MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
       MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
  for (auto VT : {MVT::i8, MVT::i16})
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
  for (MVT VT : VectorIntTypes) {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
  for (MVT VT : FloatVectorTypes) {
  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
  unsigned Opc = N->getOpcode();
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
  return N->getValueType(0) == MVT::f32;
  if (isa<MemSDNode>(N))
  switch (N->getOpcode()) {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
  for (const SDNode *U : N->uses()) {
                                            bool ForCodeSize) const {
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
  EVT OldVT = N->getValueType(0);
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
  return (OldSize < 32);
  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
                                          CastTy, MMO, &Fast) &&
  switch (N->getOpcode()) {
    unsigned IntrID = N->getConstantOperandVal(0);
  switch (Op.getOpcode()) {
  EVT VT = Op.getValueType();
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
  unsigned SrcSize = Source.getSizeInBits();
  return DestSize < SrcSize && DestSize % 32 == 0;
  unsigned SrcSize = Source->getScalarSizeInBits();
  return SrcSize >= 32;
  return DestSize < SrcSize && DestSize % 32 == 0;
  unsigned SrcSize = Src->getScalarSizeInBits();
  return DestSize >= 32;
  return SrcSize == 32 && DestSize == 64;
  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;
  return Src == MVT::i32 && Dest == MVT::i64;
         "Expected shift op");
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
    return CC_AMDGPU_CS_CHAIN;
    return CC_AMDGPU_Func;
    return RetCC_SI_Shader;
    return RetCC_SI_Gfx;
    return RetCC_AMDGPU_Func;
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  unsigned InIndex = 0;
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
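    // Each explicit kernel argument is placed at the next offset in the
    // kernarg segment that satisfies its ABI alignment; ExplicitOffset is the
    // fixed byte offset at which the explicit arguments begin.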
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
    for (unsigned Value = 0, NumValues = ValueVTs.size();
      } else if (RegisterVT.isVector()) {
        assert(MemoryBits % NumElements == 0);
                                           MemoryBits / NumElements);
        unsigned PartOffset = 0;
        for (unsigned i = 0; i != NumRegs; ++i) {
                                     BasePartOffset + PartOffset,
                                                 int ClobberedFI) const {
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
        if (FI->getIndex() < 0) {
          int64_t InLastByte = InFirstByte;
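          // The byte ranges [FirstByte, LastByte] and [InFirstByte, InLastByte]
          // overlap exactly when one range's start falls inside the other,
          // which is the disjunction tested below.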
          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
    FuncName = G->getSymbol();
    FuncName = G->getGlobal()->getName();
  switch (Op.getOpcode()) {
                     "instruction is not implemented yet!");
  switch (N->getOpcode()) {
    if (std::optional<uint32_t> Address =
        GV->getName() != "llvm.amdgcn.module.lds") {
          Fn, "local memory global used by non-kernel function",
         "Do not know what to do with a non-zero offset");
  EVT VT = Op.getValueType();
  unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
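  // A value whose width is a non-zero multiple of 32 bits is rebuilt out of
  // 32-bit pieces: a single i32 when it is exactly 32 bits wide, otherwise a
  // vector of i32 with one element per 32-bit chunk.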
  if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
    unsigned NewNumElt = OpBitSize / 32;
    EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      MVT::i32, NewNumElt);
    for (const SDUse &U : Op->ops()) {
      Args.push_back(NewIn);
  for (const SDUse &U : Op->ops())
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();
  assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
  EVT NewVT = NumElt == 2
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
  if (LHS == NegTrue && CFalse && CRHS) {
std::pair<SDValue, SDValue>
  return std::pair(Lo, Hi);
  HiVT = NumElts - LoNumElts == 1
  return std::pair(LoVT, HiVT);
std::pair<SDValue, SDValue>
                  const EVT &LoVT, const EVT &HiVT,
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  return std::pair(Lo, Hi);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  EVT LoMemVT, HiMemVT;
  Align BaseAlign = Load->getAlign();
                     Load->getChain(), BasePtr, SrcValue, LoMemVT,
                     BaseAlign, Load->getMemOperand()->getFlags());
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  Align BaseAlign = Load->getAlign();
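  // This path appears to widen only 3-element vector loads: if the element
  // count differs, or the alignment is too small for the widened access, the
  // load is handled another way; otherwise it is performed with a 4-element
  // wide type (WideVT/WideMemVT) and the original elements recovered from it.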
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
  assert(NumElements == 3);
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  SDValue Val = Store->getValue();
  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  EVT LoMemVT, HiMemVT;
  Align BaseAlign = Store->getAlign();
  DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                    Store->getMemOperand()->getFlags());
                    HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
  EVT VT = Op.getValueType();
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;
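  // The checks below gate a fast path that performs the integer division in
  // f32: with at least 9 known sign/leading bits per operand, the significant
  // part fits within the 24-bit f32 significand, so the conversion and divide
  // stay exact enough to reconstruct the integer result.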
  if (LHSSignBits < 9)
  if (RHSSignBits < 9)
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
  std::tie(Mulhi1_Lo, Mulhi1_Hi) =
  std::tie(Mulhi2_Lo, Mulhi2_Hi) =
  std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
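  // Fallback: bit-at-a-time long division, producing one quotient bit per
  // iteration starting from the most significant position (bitPos counts
  // down from halfBitWidth - 1).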
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
  EVT VT = Op.getValueType();
  if (VT == MVT::i64) {
  if (VT == MVT::i32) {
  EVT VT = Op.getValueType();
  if (VT == MVT::i32) {
  if (VT == MVT::i64 &&
  EVT VT = Op.getValueType();
  auto Flags = Op->getFlags();
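  // IEEE-754 binary64 layout: a 52-bit fraction (mantissa) field and an
  // 11-bit biased exponent, used below to take f64 values apart bitwise.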
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  assert(Op.getValueType() == MVT::f64);
  const unsigned FractBits = 52;
      = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
  assert(Op.getValueType() == MVT::f64);
  auto VT = Op.getValueType();
  auto Arg = Op.getOperand(0u);
  EVT VT = Op.getValueType();
  switch (Src.getOpcode()) {
    return Src.getOperand(0).getValueType() == MVT::f16;
    unsigned IntrinsicID = Src.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_frexp_mant:
  if (Flags.hasApproximateFuncs())
  EVT VT = Src.getValueType();
  return IsLtSmallestNormal;
  EVT VT = Src.getValueType();
std::pair<SDValue, SDValue>
  return {ScaledInput, IsLtSmallestNormal};
  EVT VT = Op.getValueType();
  if (VT == MVT::f16) {
  auto [ScaledInput, IsLtSmallestNormal] =
  EVT VT = Op.getValueType();
  if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
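  // Two-term expansions of the factors that convert log2 to log10 / ln:
  // c_log10 + cc_log10 ~= log10(2) and c_log + cc_log ~= ln(2). The ch_*/ct_*
  // pairs further down re-split the same constants into a short head plus a
  // tail, apparently for the variant that cannot rely on fast fma.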
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;
  const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
                            (Flags.hasNoInfs() || Options.NoInfsFPMath);
  if (!IsFiniteOnly) {
  EVT VT = Src.getValueType();
  double Log2BaseInverted =
  if (VT == MVT::f32) {
                                  ScaledResultOffset, Zero, Flags);
  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
  EVT VT = Op.getValueType();
  if (VT == MVT::f16) {
  EVT VT = X.getValueType();
                    SL, VT, Mul, Flags);
  const EVT VT = X.getValueType();
  EVT VT = Op.getValueType();
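  // Constants for expanding exp/exp10 through exp2: log2(e) and log2(10),
  // each given as a leading term plus a small correction (c_*/cc_*) and, for
  // the variant without fast fma, as a short head plus tail (ch_*/cl_*).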
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;
    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;
    PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
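  // Argument range handling: for f32, exp underflows to zero below roughly
  // -103.3 and exp10 below roughly -44.9 (ln/log10 of the smallest denormal),
  // while exp overflows above roughly 88.7 and exp10 above roughly 38.5
  // (ln/log10 of the largest finite value); the constants below encode those
  // thresholds.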
      DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
        DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
  auto Opc = Op.getOpcode();
  auto Arg = Op.getOperand(0u);
  auto ResultVT = Op.getValueType();
  if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
  assert(ResultVT == Arg.getValueType());
  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
  NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
  NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
      Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
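  // When an i64 count is assembled from two 32-bit halves, 32 is added to the
  // half that is searched second (which half that is depends on whether this
  // is a leading- or trailing-zero count) so the two partial results combine
  // into a single 64-bit count.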
    OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
    OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
  if (Subtarget->isGCN())
  EVT DestVT = Op.getValueType();
  EVT SrcVT = Src.getValueType();
  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
    if (DestVT == MVT::bf16) {
  if (SrcVT != MVT::i64)
  if (DestVT == MVT::f32)
  assert(DestVT == MVT::f64);
  EVT DestVT = Op.getValueType();
  EVT SrcVT = Src.getValueType();
  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
    if (DestVT == MVT::bf16) {
  if (SrcVT != MVT::i64)
  if (DestVT == MVT::f32)
  assert(DestVT == MVT::f64);
  EVT SrcVT = Src.getValueType();
  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
  if (Signed && SrcVT == MVT::f32) {
  if (SrcVT == MVT::f64) {
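    // The bit patterns below are 2^-32 and -2^32, first as f64 and then as
    // f32: multiplying by 2^-32 and taking the floor yields the upper 32 bits
    // of the result, and an fma with -2^32 recovers the remainder that becomes
    // the lower 32 bits.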
        llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)), SL,
        llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)), SL,
        llvm::bit_cast<float>(UINT32_C(0x2f800000)), SL, SrcVT);
        llvm::bit_cast<float>(UINT32_C(0xcf800000)), SL, SrcVT);
                           SL, MVT::i32, FloorMul);
  if (Signed && SrcVT == MVT::f32) {
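  // Used by the f64 -> f16 software conversion: 0x7ff masks the 11-bit f64
  // exponent field, and 1023 / 15 are the exponent biases of IEEE binary64
  // and binary16 respectively.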
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
  if (SrcVT == MVT::bf16) {
    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
  if (DestVT != MVT::i64)
  if (SrcVT == MVT::f16 ||
    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  for (unsigned I = 0; I < NElts; ++I)
  EVT VT = Op.getValueType();
  unsigned NewOpcode = Node24->getOpcode();
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
  if (DemandedLHS || DemandedRHS)
        DemandedLHS ? DemandedLHS : LHS,
        DemandedRHS ? DemandedRHS : RHS);
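// Constant-folds a bitfield extract that lies entirely below bit 32: the
// source has already been shifted left (Shl) so the field sits at the top of
// a 32-bit word, and shifting back down by (32 - Width) extracts it. Whether
// IntTy is signed or unsigned selects an arithmetic or logical right shift,
// i.e. sign or zero extension of the field.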
template <typename IntTy>
  if (Width + Offset < 32) {
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
  if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
    if (M->isVolatile())