27#include "llvm/IR/IntrinsicsAMDGPU.h" 
   34#define DEBUG_TYPE "AMDGPUtti" 
   37  "amdgpu-unroll-threshold-private",
 
   38  cl::desc(
"Unroll threshold for AMDGPU if private memory used in a loop"),
 
   42  "amdgpu-unroll-threshold-local",
 
   43  cl::desc(
"Unroll threshold for AMDGPU if local memory used in a loop"),
 
   47  "amdgpu-unroll-threshold-if",
 
   48  cl::desc(
"Unroll threshold increment for AMDGPU for each if statement inside loop"),
 
   52  "amdgpu-unroll-runtime-local",
 
   53  cl::desc(
"Allow runtime unroll for AMDGPU if local memory used in a loop"),
 
   57    "amdgpu-unroll-max-block-to-analyze",
 
   58    cl::desc(
"Inner loop block size threshold to analyze in unroll for AMDGPU"),
 
   63                                       cl::desc(
"Cost of alloca argument"));
 
   71                    cl::desc(
"Maximum alloca size to use for inline cost"));
 
   76    cl::desc(
"Maximum number of BBs allowed in a function after inlining" 
   77             " (compile time constraint)"));
 
   81    "amdgpu-memcpy-loop-unroll",
 
   82    cl::desc(
"Unroll factor (affecting 4x32-bit operations) to use for memory " 
   83             "operations when lowering memcpy as a loop"),
 
   92  for (
const Value *V : 
I->operand_values()) {
 
   97                  return SubLoop->contains(PHI); }))
 
 
  107      TargetTriple(TM->getTargetTriple()),
 
  109      TLI(ST->getTargetLowering()) {}
 
 
  114  const Function &
F = *L->getHeader()->getParent();
 
  116      F.getFnAttributeAsParsedInteger(
"amdgpu-unroll-threshold", 300);
 
  117  UP.
MaxCount = std::numeric_limits<unsigned>::max();
 
  130  const unsigned MaxAlloca = (256 - 16) * 4;
 
  136  if (
MDNode *LoopUnrollThreshold =
 
  138    if (LoopUnrollThreshold->getNumOperands() == 2) {
 
  140          LoopUnrollThreshold->getOperand(1));
 
  141      if (MetaThresholdValue) {
 
  147        ThresholdPrivate = std::min(ThresholdPrivate, UP.
Threshold);
 
  148        ThresholdLocal = std::min(ThresholdLocal, UP.
Threshold);
 
  153  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
 
  156    unsigned LocalGEPsSeen = 0;
 
  159               return SubLoop->contains(BB); }))
 
  169        if (UP.
Threshold < MaxBoost && Br->isConditional()) {
 
  172          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
 
  173              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
 
  179                              << *L << 
" due to " << *Br << 
'\n');
 
  191      unsigned AS = 
GEP->getAddressSpace();
 
  192      unsigned Threshold = 0;
 
  194        Threshold = ThresholdPrivate;
 
  196        Threshold = ThresholdLocal;
 
  210        unsigned AllocaSize = Ty->
isSized() ? 
DL.getTypeAllocSize(Ty) : 0;
 
  211        if (AllocaSize > MaxAlloca)
 
  220        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
 
  228                          << *L << 
" due to LDS use.\n");
 
  233      bool HasLoopDef = 
false;
 
  236        if (!Inst || L->isLoopInvariant(
Op))
 
  240             return SubLoop->contains(Inst); }))
 
  264                        << *L << 
" due to " << *
GEP << 
'\n');
 
 
  287    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
 
  288    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
 
  289    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
 
  290    AMDGPU::FeatureUnalignedAccessMode,
 
  292    AMDGPU::FeatureAutoWaitcntBeforeBarrier,
 
  295    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
 
  296    AMDGPU::FeatureTrapHandler,
 
  300    AMDGPU::FeatureSRAMECC,
 
  303    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
 
  308      TLI(ST->getTargetLowering()), CommonTTI(TM, 
F),
 
  309      IsGraphics(
AMDGPU::isGraphics(
F.getCallingConv())) {
 
  312  HasFP64FP16Denormals =
 
 
  317  return !
F || !ST->isSingleLaneExecution(*
F);
 
 
  349  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
 
  350    return 32 * 4 / ElemWidth;
 
  353  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
 
  354         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
 
  355         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
 
 
  360                                         unsigned ChainSizeInBytes,
 
  362  unsigned VecRegBitWidth = VF * LoadSize;
 
  365    return 128 / LoadSize;
 
 
  371                                             unsigned ChainSizeInBytes,
 
  373  unsigned VecRegBitWidth = VF * StoreSize;
 
  374  if (VecRegBitWidth > 128)
 
  375    return 128 / StoreSize;
 
 
  391    return 8 * ST->getMaxPrivateElementSize();
 
 
  399                                            unsigned AddrSpace)
 const {
 
  404    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
 
  405           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
 
 
  412                                             unsigned AddrSpace)
 const {
 
 
  418                                              unsigned AddrSpace)
 const {
 
 
  428    unsigned DestAddrSpace, 
Align SrcAlign, 
Align DestAlign,
 
  429    std::optional<uint32_t> AtomicElementSize)
 const {
 
  431  if (AtomicElementSize)
 
  445  unsigned I32EltsInVector = 4;
 
 
  455    unsigned RemainingBytes, 
unsigned SrcAddrSpace, 
unsigned DestAddrSpace,
 
  457    std::optional<uint32_t> AtomicCpySize)
 const {
 
  461        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
 
  462        DestAlign, AtomicCpySize);
 
  465  while (RemainingBytes >= 16) {
 
  467    RemainingBytes -= 16;
 
  471  while (RemainingBytes >= 8) {
 
  477  while (RemainingBytes >= 4) {
 
  483  while (RemainingBytes >= 2) {
 
  489  while (RemainingBytes) {
 
 
  507  case Intrinsic::amdgcn_ds_ordered_add:
 
  508  case Intrinsic::amdgcn_ds_ordered_swap: {
 
  511    if (!Ordering || !Volatile)
 
  514    unsigned OrderingVal = Ordering->getZExtValue();
 
  521    Info.WriteMem = 
true;
 
  522    Info.IsVolatile = !Volatile->isZero();
 
 
  536  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 
  537  int ISD = TLI->InstructionOpcodeToISD(Opcode);
 
  541  unsigned NElts = LT.second.isVector() ?
 
  542    LT.second.getVectorNumElements() : 1;
 
  551      return get64BitInstrCost(
CostKind) * LT.first * NElts;
 
  553    if (ST->has16BitInsts() && SLT == MVT::i16)
 
  554      NElts = (NElts + 1) / 2;
 
  557    return getFullRateInstrCost() * LT.first * NElts;
 
  563    if (SLT == MVT::i64) {
 
  565      return 2 * getFullRateInstrCost() * LT.first * NElts;
 
  568    if (ST->has16BitInsts() && SLT == MVT::i16)
 
  569      NElts = (NElts + 1) / 2;
 
  571    return LT.first * NElts * getFullRateInstrCost();
 
  573    const int QuarterRateCost = getQuarterRateInstrCost(
CostKind);
 
  574    if (SLT == MVT::i64) {
 
  575      const int FullRateCost = getFullRateInstrCost();
 
  576      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
 
  579    if (ST->has16BitInsts() && SLT == MVT::i16)
 
  580      NElts = (NElts + 1) / 2;
 
  583    return QuarterRateCost * NElts * LT.first;
 
  591        const int OPC = TLI->InstructionOpcodeToISD(
FAdd->getOpcode());
 
  593          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
 
  595          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
 
  608    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
 
  609      NElts = (NElts + 1) / 2;
 
  610    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
 
  611      NElts = (NElts + 1) / 2;
 
  613      return LT.first * NElts * get64BitInstrCost(
CostKind);
 
  615    if (ST->has16BitInsts() && SLT == MVT::f16)
 
  616      NElts = (NElts + 1) / 2;
 
  618    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
 
  619      return LT.first * NElts * getFullRateInstrCost();
 
  625    if (SLT == MVT::f64) {
 
  630      if (!ST->hasUsableDivScaleConditionOutput())
 
  631        Cost += 3 * getFullRateInstrCost();
 
  633      return LT.first * 
Cost * NElts;
 
  638      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
 
  639          (SLT == MVT::f16 && ST->has16BitInsts())) {
 
  640        return LT.first * getQuarterRateInstrCost(
CostKind) * NElts;
 
  644    if (SLT == MVT::f16 && ST->has16BitInsts()) {
 
  651          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(
CostKind);
 
  652      return LT.first * 
Cost * NElts;
 
  659      int Cost = getQuarterRateInstrCost(
CostKind) + getFullRateInstrCost();
 
  660      return LT.first * 
Cost * NElts;
 
  663    if (SLT == MVT::f32 || SLT == MVT::f16) {
 
  665      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
 
  666                 1 * getQuarterRateInstrCost(
CostKind);
 
  668      if (!HasFP32Denormals) {
 
  670        Cost += 2 * getFullRateInstrCost();
 
  673      return LT.first * NElts * 
Cost;
 
  679    return TLI->isFNegFree(SLT) ? 0 : NElts;
 
 
  693  case Intrinsic::fmuladd:
 
  694  case Intrinsic::copysign:
 
  695  case Intrinsic::minimumnum:
 
  696  case Intrinsic::maximumnum:
 
  697  case Intrinsic::canonicalize:
 
  699  case Intrinsic::round:
 
  700  case Intrinsic::uadd_sat:
 
  701  case Intrinsic::usub_sat:
 
  702  case Intrinsic::sadd_sat:
 
  703  case Intrinsic::ssub_sat:
 
 
  714  switch (ICA.
getID()) {
 
  715  case Intrinsic::fabs:
 
  718  case Intrinsic::amdgcn_workitem_id_x:
 
  719  case Intrinsic::amdgcn_workitem_id_y:
 
  720  case Intrinsic::amdgcn_workitem_id_z:
 
  724  case Intrinsic::amdgcn_workgroup_id_x:
 
  725  case Intrinsic::amdgcn_workgroup_id_y:
 
  726  case Intrinsic::amdgcn_workgroup_id_z:
 
  727  case Intrinsic::amdgcn_lds_kernel_id:
 
  728  case Intrinsic::amdgcn_dispatch_ptr:
 
  729  case Intrinsic::amdgcn_dispatch_id:
 
  730  case Intrinsic::amdgcn_implicitarg_ptr:
 
  731  case Intrinsic::amdgcn_queue_ptr:
 
  744  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
 
  746  unsigned NElts = LT.second.isVector() ?
 
  747    LT.second.getVectorNumElements() : 1;
 
  751  if ((ST->hasVOP3PInsts() &&
 
  752       (SLT == MVT::f16 || SLT == MVT::i16 ||
 
  753        (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
 
  754      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
 
  755    NElts = (NElts + 1) / 2;
 
  758  unsigned InstRate = getQuarterRateInstrCost(
CostKind);
 
  760  switch (ICA.
getID()) {
 
  762  case Intrinsic::fmuladd:
 
  763    if (SLT == MVT::f64) {
 
  764      InstRate = get64BitInstrCost(
CostKind);
 
  768    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
 
  769      InstRate = getFullRateInstrCost();
 
  771      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(
CostKind)
 
  772                                     : getQuarterRateInstrCost(
CostKind);
 
  775  case Intrinsic::copysign:
 
  776    return NElts * getFullRateInstrCost();
 
  777  case Intrinsic::minimumnum:
 
  778  case Intrinsic::maximumnum: {
 
  790        SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
 
  791    InstRate = BaseRate * 
NumOps;
 
  794  case Intrinsic::canonicalize: {
 
  796        SLT == MVT::f64 ? get64BitInstrCost(
CostKind) : getFullRateInstrCost();
 
  799  case Intrinsic::uadd_sat:
 
  800  case Intrinsic::usub_sat:
 
  801  case Intrinsic::sadd_sat:
 
  802  case Intrinsic::ssub_sat: {
 
  803    if (SLT == MVT::i16 || SLT == MVT::i32)
 
  804      InstRate = getFullRateInstrCost();
 
  806    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
 
  807    if (
any_of(ValidSatTys, [<](
MVT M) { 
return M == LT.second; }))
 
  813    if (SLT == MVT::i16 || SLT == MVT::i32)
 
  814      InstRate = 2 * getFullRateInstrCost();
 
  820  return LT.first * NElts * InstRate;
 
 
  826  assert((
I == 
nullptr || 
I->getOpcode() == Opcode) &&
 
  827         "Opcode should reflect passed instruction.");
 
  830  const int CBrCost = SCost ? 5 : 7;
 
  832  case Instruction::Br: {
 
  835    if (BI && BI->isUnconditional())
 
  836      return SCost ? 1 : 4;
 
  841  case Instruction::Switch: {
 
  845    return (
SI ? (
SI->getNumCases() + 1) : 4) * (CBrCost + 1);
 
  847  case Instruction::Ret:
 
  848    return SCost ? 1 : 10;
 
 
  855                                       std::optional<FastMathFlags> FMF,
 
  860  EVT OrigTy = TLI->getValueType(
DL, Ty);
 
  867  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 
  868  return LT.first * getFullRateInstrCost();
 
 
  875  EVT OrigTy = TLI->getValueType(
DL, Ty);
 
  882  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 
  883  return LT.first * getHalfRateInstrCost(
CostKind);
 
 
  888                                               unsigned Index, 
const Value *Op0,
 
  889                                               const Value *Op1)
 const {
 
  891  case Instruction::ExtractElement:
 
  892  case Instruction::InsertElement: {
 
  896      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
 
  907    return Index == ~0u ? 2 : 0;
 
 
  921  if (Indices.
size() > 1)
 
  927      TLI->ParseConstraints(
DL, ST->getRegisterInfo(), *CI);
 
  929  const int TargetOutputIdx = Indices.
empty() ? -1 : Indices[0];
 
  932  for (
auto &TC : TargetConstraints) {
 
  937    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
 
  940    TLI->ComputeConstraintToUse(TC, 
SDValue());
 
  943        TRI, TC.ConstraintCode, TC.ConstraintVT).second;
 
  947    if (!RC || !
TRI->isSGPRClass(RC))
 
 
 1001    case Intrinsic::read_register:
 
 1003    case Intrinsic::amdgcn_addrspacecast_nonnull: {
 
 1005          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
 
 1006      unsigned DstAS = 
Intrinsic->getType()->getPointerAddressSpace();
 
 1009             ST->hasGloballyAddressableScratch();
 
 1011    case Intrinsic::amdgcn_workitem_id_y:
 
 1012    case Intrinsic::amdgcn_workitem_id_z: {
 
 1015          ST->hasWavefrontsEvenlySplittingXDim(*
F, 
true);
 
 1016      std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
 
 1017          *
F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
 
 1018      return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
 
 1027    if (CI->isInlineAsm())
 
 1042           ST->hasGloballyAddressableScratch();
 
 
 1053    if (CI->isInlineAsm())
 
 1071  bool XDimDoesntResetWithinWaves = 
false;
 
 1074    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*
F);
 
 1082    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
 
 1089               ST->getWavefrontSizeLog2() &&
 
 1090           XDimDoesntResetWithinWaves;
 
 1105    case Intrinsic::amdgcn_if:
 
 1106    case Intrinsic::amdgcn_else: {
 
 1108      return Indices.
size() == 1 && Indices[0] == 1;
 
 
 1125  case Intrinsic::amdgcn_is_shared:
 
 1126  case Intrinsic::amdgcn_is_private:
 
 1127  case Intrinsic::amdgcn_flat_atomic_fmax_num:
 
 1128  case Intrinsic::amdgcn_flat_atomic_fmin_num:
 
 1129  case Intrinsic::amdgcn_load_to_lds:
 
 1130  case Intrinsic::amdgcn_make_buffer_rsrc:
 
 
 1140                                                    Value *NewV)
 const {
 
 1141  auto IntrID = 
II->getIntrinsicID();
 
 1143  case Intrinsic::amdgcn_is_shared:
 
 1144  case Intrinsic::amdgcn_is_private: {
 
 1145    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
 
 1153  case Intrinsic::ptrmask: {
 
 1156    Value *MaskOp = 
II->getArgOperand(1);
 
 1159    bool DoTruncate = 
false;
 
 1163    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
 
 1167      if (
DL.getPointerSizeInBits(OldAS) != 64 ||
 
 1168          DL.getPointerSizeInBits(NewAS) != 32)
 
 1181      MaskTy = 
B.getInt32Ty();
 
 1182      MaskOp = 
B.CreateTrunc(MaskOp, MaskTy);
 
 1185    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->
getType(), MaskTy},
 
 1188  case Intrinsic::amdgcn_flat_atomic_fmax_num:
 
 1189  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
 
 1190    Type *DestTy = 
II->getType();
 
 1197        M, 
II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
 
 1198    II->setArgOperand(0, NewV);
 
 1199    II->setCalledFunction(NewDecl);
 
 1202  case Intrinsic::amdgcn_load_to_lds: {
 
 1207    II->setArgOperand(0, NewV);
 
 1208    II->setCalledFunction(NewDecl);
 
 1211  case Intrinsic::amdgcn_make_buffer_rsrc: {
 
 1213    Type *DstTy = 
II->getType();
 
 1216        M, 
II->getIntrinsicID(), {DstTy, SrcTy});
 
 1217    II->setArgOperand(0, NewV);
 
 1218    II->setCalledFunction(NewDecl);
 
 
 1239  unsigned ScalarSize = 
DL.getTypeSizeInBits(SrcTy->getElementType());
 
 1241      (ScalarSize == 16 || ScalarSize == 8)) {
 
 1245    unsigned RequestedElts =
 
 1246        count_if(Mask, [](
int MaskElt) { 
return MaskElt != -1; });
 
 1247    unsigned EltsPerReg = 32 / ScalarSize;
 
 1248    if (RequestedElts == 0)
 
 1256      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
 
 1258      unsigned NumPerms = 
alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
 
 1261      return NumPerms + NumPermMasks;
 
 1270      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
 
 1275      unsigned NumPerms = 
alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
 
 1278      return NumPerms + NumPermMasks;
 
 
 1297  for (
auto &
Op : 
I->operands()) {
 
 1306  return !
Ops.empty();
 
 
 1313    = 
static_cast<const GCNSubtarget *
>(TM.getSubtargetImpl(*Caller));
 
 1315    = 
static_cast<const GCNSubtarget *
>(TM.getSubtargetImpl(*Callee));
 
 1317  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
 
 1318  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
 
 1320  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
 
 1321  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
 
 1322  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
 
 1332  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
 
 1333      Callee->hasFnAttribute(Attribute::InlineHint))
 
 1339    if (Callee->size() == 1)
 
 1341    size_t BBSize = Caller->size() + Callee->size() - 1;
 
 
 1351  const int NrOfSGPRUntilSpill = 26;
 
 1352  const int NrOfVGPRUntilSpill = 32;
 
 1356  unsigned adjustThreshold = 0;
 
 1362    for (
auto ArgVT : ValueVTs) {
 
 1366        SGPRsInUse += CCRegNum;
 
 1368        VGPRsInUse += CCRegNum;
 
 1378  ArgStackCost += 
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
 
 1381  ArgStackCost += 
const_cast<GCNTTIImpl *
>(TTIImpl)->getMemoryOpCost(
 
 1387  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
 
 1389  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
 
 1391  return adjustThreshold;
 
 
 1400  unsigned AllocaSize = 0;
 
 1407    unsigned AddrSpace = Ty->getAddressSpace();
 
 
 1460  static_assert(InlinerVectorBonusPercent == 0, 
"vector bonus assumed to be 0");
 
 1464    return BB.getTerminator()->getNumSuccessors() > 1;
 
 1467    Threshold += Threshold / 2;
 
 1473  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
 
 1475  return AllocaThresholdBonus;
 
 
 1481  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
 
 
 1486  CommonTTI.getPeelingPreferences(L, SE, PP);
 
 
 1491             ? getFullRateInstrCost()
 
 1492             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(
CostKind)
 
 1493                                      : getQuarterRateInstrCost(
CostKind);
 
 1496std::pair<InstructionCost, MVT>
 
 1497GCNTTIImpl::getTypeLegalizationCost(
Type *Ty)
 const {
 
 1499  auto Size = 
DL.getTypeSizeInBits(Ty);
 
 1506  Cost.first += (
Size + 255) / 256;
 
 1511  return ST->hasPrefetch() ? 128 : 0;
 
 
 1522  LB.
push_back({
"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
 
 1523  LB.push_back({
"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
 
 1524  LB.push_back({
"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
 
 1525  std::pair<unsigned, unsigned> FlatWorkGroupSize =
 
 1526      ST->getFlatWorkGroupSizes(
F);
 
 1527  LB.push_back({
"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
 
 1528  LB.push_back({
"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
 
 1529  std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(
F);
 
 1530  LB.push_back({
"amdgpu-waves-per-eu[0]", WavesPerEU.first});
 
 1531  LB.push_back({
"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 
 
 1536  if (!ST->hasIEEEMode()) 
 
 1543  Attribute IEEEAttr = 
F->getFnAttribute(
"amdgpu-ieee");
 
 
 1558    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
 
 1559        VecTy->getElementType()->isIntegerTy(8)) {
 
 
 1570    if (VecTy->getElementType()->isIntegerTy(8)) {
 
 
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
Functions, function parameters, and return types can have attributes to indicate how they should be t...
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
LLVM Basic Block Representation.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
A parsed version of the target data layout string in and methods for querying it.
constexpr bool isScalar() const
Exactly one element.
Convenience struct for specifying and reasoning about fast-math flags.
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
bool hasFullRate64Ops() const
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool isAlwaysUniform(const Value *V) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isSourceOfDivergence(const Value *V) const override
int getInliningLastCallToStaticBonus() const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Type * getReturnType() const
const IntrinsicInst * getInst() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Represents a single loop in the control flow graph.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
A Module instance is used to store all the information related to an LLVM module.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represents a constant reference to a string.
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
@ ADD
Simple integer binary arithmetic operators.
@ FADD
Simple binary floating point operators.
@ SHL
Shift and rotation operations.
@ AND
Bitwise operators - logical and, logical or, logical xor.
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
This struct is a compact representation of a valid (non-zero power of two) alignment.
static constexpr DenormalMode getPreserveSign()
uint64_t getScalarSizeInBits() const
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const