doxygen/html/AMDGPUSubtarget_8cpp_source.html

//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// Implements the AMDGPU specific subclass of TargetSubtarget.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUSubtarget.h"

#include "AMDGPUCallLowering.h"

#include "AMDGPUInstructionSelector.h"

#include "AMDGPULegalizerInfo.h"

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPUTargetMachine.h"

#include "GCNSubtarget.h"

#include "R600Subtarget.h"

#include "SIMachineFunctionInfo.h"

#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/ADT/SmallString.h"

#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"

#include "llvm/CodeGen/MachineScheduler.h"

#include "llvm/CodeGen/TargetFrameLowering.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/IntrinsicsR600.h"

#include "llvm/IR/MDBuilder.h"

#include "llvm/MC/MCSubtargetInfo.h"

#include <algorithm>


using namespace llvm;


#define DEBUG_TYPE "amdgpu-subtarget"


#define GET_SUBTARGETINFO_TARGET_DESC

#define GET_SUBTARGETINFO_CTOR

#define AMDGPUSubtarget GCNSubtarget

#include "AMDGPUGenSubtargetInfo.inc"

#undef AMDGPUSubtarget


// Command-line option "amdgpu-enable-power-sched"; defaults to false.
static cl::opt<bool> EnablePowerSched(
  "amdgpu-enable-power-sched",
  cl::desc("Enable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Command-line option "amdgpu-vgpr-index-mode"; defaults to false. Consulted
// by GCNSubtarget::useVGPRIndexMode().
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

// Command-line option "amdgpu-use-aa-in-codegen"; defaults to true. Its value
// is what GCNSubtarget::useAA() returns.
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Hidden command-line option "amdgpu-nsa-threshold"; defaults to 3.
static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
                                      cl::desc("Number of addresses from which to enable MIMG NSA."),
                                      cl::init(3), cl::Hidden);


// Destructor is defined out of line here and uses the compiler-generated
// behavior.
GCNSubtarget::~GCNSubtarget() = default;


/// Parses the subtarget feature string and fills in defaults for any
/// characteristics that were left unspecified.
///
/// A set of default features is prepended to \p FS (so anything in \p FS can
/// still override them), the combined string is handed to
/// ParseSubtargetFeatures, and afterwards several members (generation,
/// FlatForGlobal, LDS sizes, wavefront size, ...) are fixed up.
///
/// \param TT  Target triple; used to pick the default generation and to detect
///            the amdgcn architecture.
/// \param GPU Processor name; also passed as the tune CPU.
/// \param FS  User-specified feature string.
/// \returns *this, so the call can be used inside a member initializer.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits: if the user enabled some wavefront size,
  // explicitly turn off every wavefront size that FS does not mention.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  // User features go last so that they take precedence over the defaults.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Record the addressable size before LocalMemorySize is possibly doubled
  // below.
  AddressableLocalMemorySize = LocalMemorySize;

  // On GFX10+ without the CU-mode feature, the local memory size is doubled.
  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  // Properties derived purely from the generation.
  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Note: the target ID is derived from the user's FS, not from FullFS.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}


/// Constructs the common AMDGPU subtarget base, recording the target triple
/// \p TT in TargetTriple.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}


/// Returns true only when the subtarget has true 16-bit instructions and
/// EnableRealTrue16Insts is set.
bool AMDGPUSubtarget::useRealTrue16Insts() const {
  if (!hasTrue16BitInsts())
    return false;
  return EnableRealTrue16Insts;
}


/// Constructs a GCN subtarget for triple \p TT, processor \p GPU and feature
/// string \p FS.
///
/// Feature parsing happens as part of member initialization: InstrInfo is
/// initialized from the result of initializeSubtargetDependencies(). The body
/// then caches MaxWavesPerEU / EUsPerCU and creates the GlobalISel helper
/// objects (call lowering, inline asm lowering, legalizer, register bank info
/// and instruction selector).
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // Parses GPU/FS and sets up defaults before InstrInfo is constructed.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  // The instruction selector needs the register bank info created just above.
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}


/// Returns the constant bus limit for \p Opcode.
///
/// The limit is 1 on every generation before GFX10. From GFX10 on it is 2,
/// except for the 64-bit shift opcodes listed below, which stay at 1.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  const bool IsPreGFX10 = getGeneration() < GFX10;
  if (IsPreGFX10)
    return 1;

  unsigned Limit = 2;
  switch (Opcode) {
  // 64-bit shifts are restricted to a limit of 1.
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    Limit = 1;
    break;
  default:
    break;
  }

  return Limit;
}


/// Returns true if the instruction \p Opcode zeroes the high 16 bits of its
/// destination on this subtarget.
///
/// The opcodes fall into three groups:
///  - the first (large) group returns true for generations up to and
///    including GFX9;
///  - the MAD/MAC/FMA "AK/MK/MAC" group returns true only on
///    VOLCANIC_ISLANDS;
///  - V_MAD_MIX{LO,HI}_F16 and every unlisted opcode return false.
///
/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  // The mix instructions are listed explicitly but share the default answer.
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}


// Computes the largest per-workgroup LDS allocation (in bytes) with which the
// function \p F can still reach an occupancy of \p NWaves waves per SIMD / EU.
// Only the function's *maximum* flat workgroup size is considered.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned Wave = getWavefrontSize();
  const unsigned MaxWGSize = getFlatWorkGroupSizes(F).second;

  // Waves needed for one workgroup (rounded up), never less than one.
  unsigned WavesPerWG = (MaxWGSize + Wave - 1) / Wave;
  if (WavesPerWG == 0)
    WavesPerWG = 1;

  // Workgroups that fit on a CU at the requested occupancy, never less than
  // one.
  unsigned WGsPerCU = (NWaves * getEUsPerCU()) / WavesPerWG;
  if (WGsPerCU == 0)
    WGsPerCU = 1;

  // The LDS is shared evenly between those workgroups.
  return getLocalMemorySize() / WGsPerCU;
}


// FIXME: Should return min,max range.

//

// Returns the maximum occupancy, in number of waves per SIMD / EU, that can

// be achieved when only the given function is running on the machine; and

// taking into account the overall number of wave slots, the (maximum) workgroup

// size, and the per-workgroup LDS allocation size.

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,

  const Function &F) const {

  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;

  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);

  if (!MaxWorkGroupsPerCu)

    return 0;


  const unsigned WaveSize = getWavefrontSize();


  // FIXME: Do we need to account for alignment requirement of LDS rounding the

  // size up?

  // Compute restriction based on LDS usage

  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);


  // This can be queried with more LDS than is possible, so just assume the

  // worst.

  if (NumGroups == 0)

    return 1;


  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);


  // Round to the number of waves per CU.

  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);

  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;


  // Number of waves per EU (SIMD).

  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());


  // Clamp to the maximum possible number of waves.

  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());


  // FIXME: Needs to be a multiple of the group size?

  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);


  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&

         "computed invalid occupancy");

  return MaxWaves;

}


unsigned

AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {

  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());

}


/// Returns the default {minimum, maximum} flat workgroup size for calling
/// convention \p CC. The minimum is always 1; the maximum is the wavefront
/// size for the listed graphics shader conventions and the subtarget's
/// maximum flat workgroup size for everything else.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  unsigned MaxSize;
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    MaxSize = getWavefrontSize();
    break;
  default:
    MaxSize = getMaxFlatWorkGroupSize();
    break;
  }
  return std::pair<unsigned, unsigned>(1u, MaxSize);
}


/// Returns the {minimum, maximum} flat workgroup size for \p F.
///
/// The "amdgpu-flat-work-group-size" attribute is honored when it is well
/// formed (min <= max) and lies within the subtarget's limits; otherwise the
/// calling-convention default is returned.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Fallback used when the attribute is absent or unusable.
  const std::pair<unsigned, unsigned> Fallback =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Value requested through the function attribute (Fallback if absent).
  const std::pair<unsigned, unsigned> FromAttr =
    AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size",
                                    Fallback);

  // The request must be ordered and must not violate the subtarget's
  // specifications.
  const bool Usable = FromAttr.first <= FromAttr.second &&
                      FromAttr.first >= getMinFlatWorkGroupSize() &&
                      FromAttr.second <= getMaxFlatWorkGroupSize();

  return Usable ? FromAttr : Fallback;
}


/// Validates a requested {minimum, maximum} waves-per-EU pair.
///
/// \param Requested          The requested range; a maximum of 0 skips the
///                           min <= max ordering check.
/// \param FlatWorkGroupSizes The {min, max} flat workgroup sizes of the
///                           function (e.g. from the
///                           "amdgpu-flat-work-group-size" attribute).
/// \returns \p Requested if it is consistent, otherwise the default range
///          {waves implied by the maximum flat workgroup size,
///           getMaxWavesPerEU()}.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  const unsigned HardwareMax = getMaxWavesPerEU();

  // The maximum flat workgroup size implies a lower bound on the number of
  // waves per execution unit; that bound is the default minimum.
  const unsigned ImpliedMin =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  const std::pair<unsigned, unsigned> Fallback(ImpliedMin, HardwareMax);

  // Reject the request if:
  //  - its minimum exceeds its (non-zero) maximum,
  //  - it violates the subtarget's specifications, or
  //  - its minimum is below the value implied by the flat workgroup size.
  const bool Rejected =
      (Requested.second && Requested.first > Requested.second) ||
      Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU() ||
      Requested.first < ImpliedMin;

  return Rejected ? Fallback : Requested;
}


/// Returns the effective {minimum, maximum} waves per execution unit for
/// \p F: the "amdgpu-waves-per-eu" attribute (defaulting to
/// {1, getMaxWavesPerEU()}) validated by getEffectiveWavesPerEU() against
/// \p FlatWorkGroupSizes.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Range assumed when the attribute is not present.
  const std::pair<unsigned, unsigned> Fallback(1, getMaxWavesPerEU());

  // Range requested through the function attribute.
  const std::pair<unsigned, unsigned> FromAttr =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Fallback, true);

  return getEffectiveWavesPerEU(FromAttr, FlatWorkGroupSizes);
}


/// Returns operand \p Dim of the "reqd_work_group_size" metadata attached to
/// \p Kernel, or std::numeric_limits<unsigned>::max() when the metadata is
/// missing or does not have exactly three operands.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *ReqdSizes = Kernel.getMetadata("reqd_work_group_size");
  if (!ReqdSizes || ReqdSizes->getNumOperands() != 3)
    return std::numeric_limits<unsigned>::max();

  auto *Size = mdconst::extract<ConstantInt>(ReqdSizes->getOperand(Dim));
  return Size->getZExtValue();
}


/// Returns true if the OS is Mesa3D and \p F does not use a shader calling
/// convention.
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  if (!isMesa3DOS())
    return false;
  return !AMDGPU::isShader(F.getCallingConv());
}


/// Returns the largest workitem ID possible in \p Dimension for \p Kernel:
/// the required workgroup size minus one when "reqd_work_group_size" is
/// usable, otherwise the maximum flat workgroup size minus one.
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  // Sentinel returned by getReqdWorkGroupSize() when no size is available.
  const unsigned NoReqdSize = std::numeric_limits<unsigned>::max();

  const unsigned Reqd = getReqdWorkGroupSize(Kernel, Dimension);
  const unsigned Size =
      Reqd == NoReqdSize ? getFlatWorkGroupSizes(Kernel).second : Reqd;
  return Size - 1;
}


/// Returns true if the maximum workitem ID of \p Func is 0 in all three
/// dimensions.
bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  unsigned Dim = 0;
  while (Dim != 3) {
    // Any dimension that allows a non-zero ID rules out single-lane execution.
    if (getMaxWorkitemID(Func, Dim) != 0)
      return false;
    ++Dim;
  }
  return true;
}


/// Attaches !range metadata to \p I describing the possible values of a
/// workitem-ID or local-size query.
///
/// By default the bound is the maximum flat workgroup size of the enclosing
/// function. If \p I is a call to one of the recognized workitem-id /
/// local-size intrinsics and the kernel carries usable
/// "reqd_work_group_size" metadata, the bound is narrowed to the required size
/// of the queried dimension.
///
/// \returns false (without touching \p I) if the maximum size is 0, true after
///          the metadata has been set.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  // Sentinel meaning "no dimension was identified".
  constexpr unsigned InvalidDim = std::numeric_limits<unsigned>::max();

  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = InvalidDim;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Only 0, 1 and 2 are valid dimensions: reqd_work_group_size has exactly
      // three operands, so index 3 must not be let through.
      if (Dim < 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}


unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {

  assert(AMDGPU::isKernel(F.getCallingConv()));


  // We don't allocate the segment if we know the implicit arguments weren't

  // used, even if the ABI implies we need them.

  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))

    return 0;


  if (isMesaKernel(F))

    return 16;


  // Assume all implicit inputs are used by default

  const Module *M = F.getParent();

  unsigned NBytes =

      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;

  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",

                                         NBytes);

}


/// Computes the size in bytes of the explicit kernel arguments of \p F, laid
/// out in declaration order with each argument aligned to its own alignment.
///
/// \param F        Kernel (AMDGPU_KERNEL or SPIR_KERNEL calling convention).
/// \param MaxAlign Output: the largest argument alignment seen (at least 1).
/// \returns the offset just past the last argument.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &Layout = F.getParent()->getDataLayout();
  uint64_t Offset = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // byref arguments are laid out using the pointee type and the alignment
    // given on the parameter; all others use their own type's ABI alignment.
    Type *Ty;
    MaybeAlign ParamAlign;
    if (Arg.hasByRefAttr()) {
      Ty = Arg.getParamByRefType();
      ParamAlign = Arg.getParamAlign();
    } else {
      Ty = Arg.getType();
    }

    const Align ArgAlign = Layout.getValueOrABITypeAlignment(ParamAlign, Ty);
    const uint64_t ArgSize = Layout.getTypeAllocSize(Ty);

    // Place the argument at the next suitably aligned offset.
    Offset = alignTo(Offset, ArgAlign) + ArgSize;
    if (MaxAlign < ArgAlign)
      MaxAlign = ArgAlign;
  }

  return Offset;
}


unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,

                                                Align &MaxAlign) const {

  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&

      F.getCallingConv() != CallingConv::SPIR_KERNEL)

    return 0;


  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);


  unsigned ExplicitOffset = getExplicitKernelArgOffset();


  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;

  unsigned ImplicitBytes = getImplicitArgNumBytes(F);

  if (ImplicitBytes != 0) {

    const Align Alignment = getAlignmentForImplicitArgPtr();

    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;

    MaxAlign = std::max(MaxAlign, Alignment);

  }


  // Being able to dereference past the end is useful for emitting scalar loads.

  return alignTo(TotalSize, 4);

}


/// Returns the DWARF flavour matching the wavefront size: Wave32 for a
/// wavefront size of 32, Wave64 for anything else.
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  if (getWavefrontSize() == 32)
    return AMDGPUDwarfFlavour::Wave32;
  return AMDGPUDwarfFlavour::Wave64;
}


/// Adjusts the machine scheduler policy for GCN targets. \p NumRegionInstrs is
/// not used.
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Leave both directions enabled: combining top down and bottom up
  // scheduling seems to give us fewer register spills than using either
  // approach on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler, so it is
  // only turned on when that scheduler is not in use.
  if (enableSIScheduler())
    return;
  Policy.ShouldTrackLaneMasks = true;
}


/// Hook run after a MIR file has been loaded. For wave32 subtargets, fixes the
/// implicit operands of every instruction in \p MF; does nothing otherwise.
void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (!isWave32())
    return;

  // Fix implicit $vcc operands after MIParser has verified that they match
  // the instruction definitions.
  for (auto &Block : MF)
    for (auto &Inst : Block)
      InstrInfo.fixImplicitOperands(Inst);
}


/// Returns true if the pseudo instruction V_MAD_F16_e64 maps to an MC opcode
/// on this subtarget (i.e. pseudoToMCOpcode does not return -1).
bool GCNSubtarget::hasMadF16() const {
  const auto MCOpcode = InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64);
  return MCOpcode != -1;
}


/// Returns true if VGPR index mode should be used: always when movrel is
/// unavailable, otherwise only when the "amdgpu-vgpr-index-mode" option is set
/// and the subtarget has VGPR index mode.
bool GCNSubtarget::useVGPRIndexMode() const {
  if (!hasMovrel())
    return true;
  return EnableVGPRIndexMode && hasVGPRIndexMode();
}


/// Returns the value of the "amdgpu-use-aa-in-codegen" option.
bool GCNSubtarget::useAA() const {
  return UseAA;
}


/// Returns the occupancy (waves per EU) achievable when \p SGPRs SGPRs are
/// used.
///
/// GFX10 and later always report getMaxWavesPerEU(). Older generations use a
/// fixed threshold table, one for VOLCANIC_ISLANDS and later and one for
/// everything before it.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  // Each entry: using at most MaxSGPRs registers still allows Waves waves.
  struct OccupancyStep {
    unsigned MaxSGPRs;
    unsigned Waves;
  };

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    static constexpr OccupancyStep VIPlusSteps[] = {
        {80, 10}, {88, 9}, {100, 8}};
    for (const OccupancyStep &Step : VIPlusSteps)
      if (SGPRs <= Step.MaxSGPRs)
        return Step.Waves;
    return 7;
  }

  static constexpr OccupancyStep PreVISteps[] = {
      {48, 10}, {56, 9}, {64, 8}, {72, 7}, {80, 6}};
  for (const OccupancyStep &Step : PreVISteps)
    if (SGPRs <= Step.MaxSGPRs)
      return Step.Waves;
  return 5;
}


/// Returns the waves per EU achievable with \p NumVGPRs VGPRs, as computed by
/// AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs for this subtarget.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  const unsigned WavesPerEU =
      AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
  return WavesPerEU;
}


/// Returns the number of SGPRs reserved for special registers.
///
/// \param HasFlatScratch Whether flat scratch is in use; the subtarget's
///                       HasArchitectedFlatScratch member is treated the same
///                       way.
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  const auto CurGen = getGeneration();

  if (CurGen >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  const bool ReservesFlatScratch = HasFlatScratch || HasArchitectedFlatScratch;
  if (ReservesFlatScratch && CurGen >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
  if (ReservesFlatScratch && CurGen == AMDGPUSubtarget::SEA_ISLANDS)
    return 4; // FLAT_SCRATCH, VCC (in that order).

  // No flat scratch registers to reserve:
  // XNACK, VCC (in that order) if XNACK is enabled, otherwise just VCC.
  return isXNACKEnabled() ? 4 : 2;
}


unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {

  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());

}


/// Returns the number of reserved SGPRs for \p F, assuming flat scratch is
/// needed whenever the subtarget has a flat address space. \p F itself is not
/// inspected.
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  return getBaseReservedNumSGPRs(hasFlatAddressSpace());
}


/// Computes the occupancy of \p F as the minimum of the hardware maximum, the
/// LDS-limited occupancy for \p LDSSize, and, when the counts are non-zero,
/// the SGPR- and VGPR-limited occupancies.
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Waves = getMaxWavesPerEU();

  const unsigned LDSWaves = getOccupancyWithLocalMemSize(LDSSize, F);
  if (LDSWaves < Waves)
    Waves = LDSWaves;

  // A register count of zero means "no restriction from this register file".
  if (NumSGPRs != 0)
    Waves = std::min(Waves, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs != 0)
    Waves = std::min(Waves, getOccupancyWithNumVGPRs(NumVGPRs));

  return Waves;
}


unsigned GCNSubtarget::getBaseMaxNumSGPRs(

    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,

    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {

  // Compute maximum number of SGPRs function can use using default/requested

  // minimum number of waves per execution unit.

  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);

  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);


  // Check if maximum number of SGPRs was explicitly requested using

  // "amdgpu-num-sgpr" attribute.

  if (F.hasFnAttribute("amdgpu-num-sgpr")) {

    unsigned Requested =

        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);


    // Make sure requested value does not violate subtarget's specifications.

    if (Requested && (Requested <= ReservedNumSGPRs))

      Requested = 0;


    // If more SGPRs are required to support the input user/system SGPRs,

    // increase to accommodate them.

    //

    // FIXME: This really ends up using the requested number of SGPRs + number

    // of reserved special registers in total. Theoretically you could re-use

    // the last input registers for these special registers, but this would

    // require a lot of complexity to deal with the weird aliasing.

    unsigned InputNumSGPRs = PreloadedSGPRs;

    if (Requested && Requested < InputNumSGPRs)

      Requested = InputNumSGPRs;


    // Make sure requested value is compatible with values implied by

    // default/requested minimum/maximum number of waves per execution unit.

    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))

      Requested = 0;

    if (WavesPerEU.second &&

        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))

      Requested = 0;


    if (Requested)

      MaxNumSGPRs = Requested;

  }


  if (hasSGPRInitBug())

    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;


  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);

}


unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {

  const Function &F = MF.getFunction();

  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),

                            getReservedNumSGPRs(MF));

}


/// Returns an upper bound on the number of preloaded SGPRs: the sum of all
/// user SGPR fields listed below, five system SGPRs and one synthetic SGPR.
static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;

  // Max number of user SGPRs.
  unsigned NumUserSGPRs = 0;
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID);
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::DispatchPtrID);
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::QueuePtrID);
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::KernargSegmentPtrID);
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::DispatchIdID);
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::FlatScratchInitID);
  NumUserSGPRs += USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs: WorkGroupIDX, WorkGroupIDY, WorkGroupIDZ,
  // WorkGroupInfo and the private segment wave byte offset.
  constexpr unsigned NumSystemSGPRs = 5;

  // Max number of synthetic SGPRs: LDSKernelId.
  constexpr unsigned NumSyntheticSGPRs = 1;

  return NumUserSGPRs + NumSystemSGPRs + NumSyntheticSGPRs;
}


/// Returns the maximum number of SGPRs for the IR function \p F, assuming the
/// maximum possible number of preloaded SGPRs.
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  const unsigned MaxPreloaded = getMaxNumPreloadedSGPRs();
  const unsigned Reserved = getReservedNumSGPRs(F);
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), MaxPreloaded, Reserved);
}


unsigned GCNSubtarget::getBaseMaxNumVGPRs(

    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {

  // Compute maximum number of VGPRs function can use using default/requested

  // minimum number of waves per execution unit.

  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);


  // Check if maximum number of VGPRs was explicitly requested using

  // "amdgpu-num-vgpr" attribute.

  if (F.hasFnAttribute("amdgpu-num-vgpr")) {

    unsigned Requested =

        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);


    if (hasGFX90AInsts())

      Requested *= 2;


    // Make sure requested value is compatible with values implied by

    // default/requested minimum/maximum number of waves per execution unit.

    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))

      Requested = 0;

    if (WavesPerEU.second &&

        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))

      Requested = 0;


    if (Requested)

      MaxNumVGPRs = Requested;

  }


  return MaxNumVGPRs;

}


/// Returns the maximum number of VGPRs for the IR function \p F, using the
/// waves-per-EU range derived from the function itself.
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  const std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(F, Waves);
}


unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {

  const Function &F = MF.getFunction();

  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());

}


/// Adjusts the latency of the register data dependency \p Dep between \p Def
/// and \p Use.
///
/// Three cases are handled, in this order:
///  - the defining instruction is a bundle,
///  - the using instruction is a bundle,
///  - a zero-latency dependency on VCC_LO.
/// Anything else is left untouched. \p SchedModel is not consulted; operand
/// latencies come from InstrInfo's scheduling model.
void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  // Only register data dependencies between two real instructions qualify.
  const bool IsRegDataDep = Dep.getKind() == SDep::Kind::Data && Dep.getReg();
  if (!IsRegDataDep || !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefMI = Def->getInstr();
  MachineInstr *UseMI = Use->getInstr();

  if (DefMI->isBundle()) {
    // Walk the bundled instructions. Each writer of the register restarts the
    // latency; every following non-writer consumes one unit of it.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto DepReg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator It(DefMI->getIterator());
    MachineBasicBlock::const_instr_iterator End(
        DefMI->getParent()->instr_end());
    unsigned Latency = 0;
    for (++It; It != End && It->isBundledWithPred(); ++It) {
      if (It->modifiesRegister(DepReg, TRI))
        Latency = InstrInfo.getInstrLatency(getInstrItineraryData(), *It);
      else if (Latency)
        --Latency;
    }
    Dep.setLatency(Latency);
    return;
  }

  if (UseMI->isBundle()) {
    // Start from the def's latency and consume one unit per bundled
    // instruction that precedes the first reader of the register.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto DepReg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator It(UseMI->getIterator());
    MachineBasicBlock::const_instr_iterator End(
        UseMI->getParent()->instr_end());
    unsigned Latency =
        InstrInfo.getInstrLatency(getInstrItineraryData(), *DefMI);
    for (++It; It != End && It->isBundledWithPred() && Latency; ++It) {
      if (It->readsRegister(DepReg, TRI))
        break;
      --Latency;
    }
    Dep.setLatency(Latency);
    return;
  }

  if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefMI, DefOpIdx, UseMI, UseOpIdx));
  }
}


namespace {
/// Scheduling DAG mutation that adds artificial edges so that independent
/// SALU instructions are scheduled after long-latency MFMA instructions,
/// ahead of the MFMA's VALU successors.
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  // DAG currently being mutated. Assigned by apply() before any helper
  // dereferences it; null until then so an unset pointer is never
  // indeterminate.
  ScheduleDAGMI *DAG = nullptr;

  explicit FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // Returns true if SU holds a SALU instruction that is not a terminator.
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  // Returns true if SU holds a VALU instruction.
  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  //
  // Starting at To, follows SALU successors through a worklist. For every
  // newly visited SU it makes From a predecessor of SU and makes SU a
  // predecessor of each VALU successor of From, whenever the DAG allows the
  // edge. At most MaxChain worklist entries are popped (already-visited
  // entries count towards that budget). The return value counts only the
  // From -> SU edges that were actually added.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Force SU to be scheduled after From.
      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      // Force the VALU successors of From to be scheduled after SU.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      // Continue the chain through SALU successors of SU.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  // Entry point of the mutation. Does nothing when the subtarget has no MAI
  // instructions, when no scheduling model is available, or when the DAG has
  // no scheduling units.
  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    //
    // LastSALU is deliberately not reset between MFMAs: the scan for
    // candidate SALU instructions resumes where the previous MFMA stopped.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MAI instructions and the AGPR read/write moves.
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      // Number of instructions wanted after the MFMA to cover its latency.
      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace


void GCNSubtarget::getPostRAMutations(

    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {

  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));

}


std::unique_ptr<ScheduleDAGMutation>

GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {

  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)

                          : nullptr;

}


unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {

  if (getGeneration() >= AMDGPUSubtarget::GFX12)

    return 0; // Not MIMG encoding.


  if (NSAThreshold.getNumOccurrences() > 0)

    return std::max(NSAThreshold.getValue(), 2u);


  int Value = MF.getFunction().getFnAttributeAsParsedInteger(

      "amdgpu-nsa-threshold", -1);

  if (Value > 0)

    return std::max(Value, 2);


  return 3;

}


/// Returns the AMDGPU subtarget of \p MF, viewed through the common base
/// class. The amdgcn architecture selects GCNSubtarget; any other
/// architecture selects R600Subtarget.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  const bool IsGCN =
      MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn;
  if (IsGCN)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(
      MF.getSubtarget<R600Subtarget>());
}


/// Returns the AMDGPU subtarget that \p TM provides for \p F, viewed through
/// the common base class. The amdgcn architecture selects GCNSubtarget; any
/// other architecture selects R600Subtarget.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  const bool IsGCN = TM.getTargetTriple().getArch() == Triple::amdgcn;
  if (IsGCN)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}


/// Determines which user SGPR inputs \p F needs on subtarget \p ST and
/// accumulates the number of user SGPRs they occupy in NumUsedUserSGPRs.
///
/// The decision is driven by the calling convention, the target OS, the flat
/// scratch configuration, and "amdgpu-*" function attributes.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CallConv = F.getCallingConv();
  const bool IsKernelCC = CallConv == CallingConv::AMDGPU_KERNEL ||
                          CallConv == CallingConv::SPIR_KERNEL;

  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool MakesCalls = F.hasFnAttribute("amdgpu-calls");

  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool UsesStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  // Kernels with explicit arguments or implicit argument bytes need the
  // kernarg segment pointer.
  if (IsKernelCC && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  const bool OnAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (OnAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Non-graphics functions get each of these inputs unless the matching
  // "amdgpu-no-*" attribute opts out.
  if (!AMDGPU::isGraphics(CallConv)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CallConv) &&
      (OnAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (MakesCalls || UsesStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  // Sum the SGPRs consumed by every enabled input.
  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
}


/// Reserves \p NumSGPRs additional user SGPRs for kernarg preloading,
/// updating both the kernarg preload count and the total used user SGPR
/// count.
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  // The preload allocation alone must never exceed the subtarget's user SGPR
  // limit.
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST) &&
         "kernarg preload SGPRs exceed the maximum number of user SGPRs");
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}


/// Returns the number of user SGPRs still unallocated: the subtarget's
/// maximum minus the number already in use.
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  const auto MaxUserSGPRs = AMDGPU::getMaxNumUserSGPRs(ST);
  return MaxUserSGPRs - NumUsedUserSGPRs;
}


/// Reads the three-element "amdgpu-max-num-workgroups" integer vector
/// attribute of \p F.
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  constexpr unsigned NumElements = 3;
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups",
                                        NumElements);
}

HasCalls
@ HasCalls
Definition: AArch64InstrInfo.cpp:8189

MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:72

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:74

UseAA
static cl::opt< bool > UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen."))

AMDGPUBaseInfo.h

AMDGPUCallLowering.h
This file describes how to lower LLVM calls to machine code calls.

AMDGPUInstructionSelector.h
This file declares the targeting of the InstructionSelector class for AMDGPU.

AMDGPULegalizerInfo.h
This file declares the targeting of the MachineLegalizer class for AMDGPU.

AMDGPURegisterBankInfo.h
This file declares the targeting of the RegisterBankInfo class for AMDGPU.

EnableVGPRIndexMode
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))

NSAThreshold
static cl::opt< unsigned > NSAThreshold("amdgpu-nsa-threshold", cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(3), cl::Hidden)

EnablePowerSched
static cl::opt< bool > EnablePowerSched("amdgpu-enable-power-sched", cl::desc("Enable scheduling to minimize mAI power bursts"), cl::init(false))

getMaxNumPreloadedSGPRs
static unsigned getMaxNumPreloadedSGPRs()
Definition: AMDGPUSubtarget.cpp:793

UseAA
static cl::opt< bool > UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true))

getReqdWorkGroupSize
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)
Definition: AMDGPUSubtarget.cpp:470

AMDGPUSubtarget.h
Base class for AMDGPU specific classes of TargetSubtarget.

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

From
BlockVerifier::State From
Definition: BlockVerifier.cpp:57

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125

MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:113

InlineAsmLowering.h
This file describes how to lower LLVM inline asm to machine code INLINEASM.

MCSubtargetInfo.h

F
#define F(x, y, z)
Definition: MD5.cpp:55

I
#define I(x, y, z)
Definition: MD5.cpp:58

MDBuilder.h

MachineScheduler.h

TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1875

test
modulo schedule test
Definition: ModuloSchedule.cpp:2138

if
if(VerifyEach)
Definition: PassBuilderBindings.cpp:71

TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47

R600Subtarget.h
AMDGPU R600 specific subclass of TargetSubtarget.

CC
auto CC
Definition: RISCVRedundantCopyElimination.cpp:79

assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

SIMachineFunctionInfo.h

SmallString.h
This file defines the SmallString class.

TargetFrameLowering.h

AMDGPUGenSubtargetInfo

Node
Definition: ItaniumDemangle.h:161

llvm::AMDGPUCallLowering
Definition: AMDGPUCallLowering.h:26

llvm::AMDGPUInstructionSelector
Definition: AMDGPUInstructionSelector.h:48

llvm::AMDGPULegalizerInfo
Definition: AMDGPULegalizerInfo.h:30

llvm::AMDGPURegisterBankInfo
Definition: AMDGPURegisterBankInfo.h:42

llvm::AMDGPUSubtarget
Definition: AMDGPUSubtarget.h:29

llvm::AMDGPUSubtarget::getOccupancyWithLocalMemSize
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const
Inverse of getMaxLocalMemWithWaveCount.
Definition: AMDGPUSubtarget.cpp:345

llvm::AMDGPUSubtarget::isMesa3DOS
bool isMesa3DOS() const
Definition: AMDGPUSubtarget.h:136

llvm::AMDGPUSubtarget::AMDGPUSubtarget
AMDGPUSubtarget(const Triple &TT)
Definition: AMDGPUSubtarget.cpp:168

llvm::AMDGPUSubtarget::HasFminFmaxLegacy
bool HasFminFmaxLegacy
Definition: AMDGPUSubtarget.h:63

llvm::AMDGPUSubtarget::getDefaultFlatWorkGroupSize
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Definition: AMDGPUSubtarget.cpp:391

llvm::AMDGPUSubtarget::WavefrontSizeLog2
char WavefrontSizeLog2
Definition: AMDGPUSubtarget.h:71

llvm::AMDGPUSubtarget::EnableRealTrue16Insts
bool EnableRealTrue16Insts
Definition: AMDGPUSubtarget.h:53

llvm::AMDGPUSubtarget::getAlignmentForImplicitArgPtr
Align getAlignmentForImplicitArgPtr() const
Definition: AMDGPUSubtarget.h:242

llvm::AMDGPUSubtarget::getEUsPerCU
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
Definition: AMDGPUSubtarget.h:240

llvm::AMDGPUSubtarget::isMesaKernel
bool isMesaKernel(const Function &F) const
Definition: AMDGPUSubtarget.cpp:477

llvm::AMDGPUSubtarget::getWavesPerEU
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
Definition: AMDGPUSubtarget.h:101

llvm::AMDGPUSubtarget::useRealTrue16Insts
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Definition: AMDGPUSubtarget.cpp:170

llvm::AMDGPUSubtarget::getMinWavesPerEU
virtual unsigned getMinWavesPerEU() const =0

llvm::AMDGPUSubtarget::getFlatWorkGroupSizes
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
Definition: AMDGPUSubtarget.cpp:405

llvm::AMDGPUSubtarget::GFX10
@ GFX10
Definition: AMDGPUSubtarget.h:41

llvm::AMDGPUSubtarget::GFX9
@ GFX9
Definition: AMDGPUSubtarget.h:40

llvm::AMDGPUSubtarget::GFX12
@ GFX12
Definition: AMDGPUSubtarget.h:43

llvm::AMDGPUSubtarget::INVALID
@ INVALID
Definition: AMDGPUSubtarget.h:32

llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition: AMDGPUSubtarget.h:38

llvm::AMDGPUSubtarget::SOUTHERN_ISLANDS
@ SOUTHERN_ISLANDS
Definition: AMDGPUSubtarget.h:37

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition: AMDGPUSubtarget.h:39

llvm::AMDGPUSubtarget::EUsPerCU
unsigned EUsPerCU
Definition: AMDGPUSubtarget.h:67

llvm::AMDGPUSubtarget::makeLIDRangeMetadata
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
Definition: AMDGPUSubtarget.cpp:498

llvm::AMDGPUSubtarget::getMaxWorkitemID
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
Definition: AMDGPUSubtarget.cpp:481

llvm::AMDGPUSubtarget::getImplicitArgNumBytes
unsigned getImplicitArgNumBytes(const Function &F) const
Definition: AMDGPUSubtarget.cpp:560

llvm::AMDGPUSubtarget::getLocalMemorySize
unsigned getLocalMemorySize() const
Definition: AMDGPUSubtarget.h:229

llvm::AMDGPUSubtarget::getMaxNumWorkGroups
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
Definition: AMDGPUSubtarget.cpp:1125

llvm::AMDGPUSubtarget::getWavesPerEUForWorkGroup
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0

llvm::AMDGPUSubtarget::getMaxWorkGroupsPerCU
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0

llvm::AMDGPUSubtarget::HasSMulHi
bool HasSMulHi
Definition: AMDGPUSubtarget.h:61

llvm::AMDGPUSubtarget::getKernArgSegmentSize
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
Definition: AMDGPUSubtarget.cpp:601

llvm::AMDGPUSubtarget::hasTrue16BitInsts
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
Definition: AMDGPUSubtarget.h:159

llvm::AMDGPUSubtarget::LocalMemorySize
unsigned LocalMemorySize
Definition: AMDGPUSubtarget.h:69

llvm::AMDGPUSubtarget::MaxWavesPerEU
unsigned MaxWavesPerEU
Definition: AMDGPUSubtarget.h:68

llvm::AMDGPUSubtarget::getAMDGPUDwarfFlavour
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
Definition: AMDGPUSubtarget.cpp:623

llvm::AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
Definition: AMDGPUSubtarget.cpp:326

llvm::AMDGPUSubtarget::getMaxFlatWorkGroupSize
virtual unsigned getMaxFlatWorkGroupSize() const =0

llvm::AMDGPUSubtarget::getExplicitKernelArgOffset
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
Definition: AMDGPUSubtarget.h:248

llvm::AMDGPUSubtarget::getMaxWavesPerEU
unsigned getMaxWavesPerEU() const
Definition: AMDGPUSubtarget.h:285

llvm::AMDGPUSubtarget::getExplicitKernArgSize
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
Definition: AMDGPUSubtarget.cpp:579

llvm::AMDGPUSubtarget::AddressableLocalMemorySize
unsigned AddressableLocalMemorySize
Definition: AMDGPUSubtarget.h:70

llvm::AMDGPUSubtarget::isAmdHsaOS
bool isAmdHsaOS() const
Definition: AMDGPUSubtarget.h:128

llvm::AMDGPUSubtarget::getEffectiveWavesPerEU
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
Definition: AMDGPUSubtarget.cpp:428

llvm::AMDGPUSubtarget::isSingleLaneExecution
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
Definition: AMDGPUSubtarget.cpp:489

llvm::AMDGPUSubtarget::get
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Definition: AMDGPUSubtarget.cpp:1034

llvm::AMDGPUSubtarget::getWavefrontSize
unsigned getWavefrontSize() const
Definition: AMDGPUSubtarget.h:221

llvm::AMDGPUSubtarget::getMinFlatWorkGroupSize
virtual unsigned getMinFlatWorkGroupSize() const =0

llvm::AMDGPU::IsaInfo::AMDGPUTargetID::setTargetIDFromFeaturesString
void setTargetIDFromFeaturesString(StringRef FS)
Definition: AMDGPUBaseInfo.cpp:754

llvm::AMDGPU::IsaInfo::AMDGPUTargetID::getXnackSetting
TargetIDSetting getXnackSetting() const
Definition: AMDGPUBaseInfo.h:150

llvm::AMDGPU::IsaInfo::AMDGPUTargetID::getSramEccSetting
TargetIDSetting getSramEccSetting() const
Definition: AMDGPUBaseInfo.h:179

llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:76

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31

llvm::DataLayout
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:110

llvm::Function
Definition: Function.h:62

llvm::Function::getFnAttributeAsParsedInteger
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:713

llvm::GCNSubtarget
Definition: GCNSubtarget.h:35

llvm::GCNSubtarget::hasFlat
bool hasFlat() const
Definition: GCNSubtarget.h:371

llvm::GCNSubtarget::useVGPRIndexMode
bool useVGPRIndexMode() const
Definition: AMDGPUSubtarget.cpp:660

llvm::GCNSubtarget::mirFileLoaded
void mirFileLoaded(MachineFunction &MF) const override
Definition: AMDGPUSubtarget.cpp:645

llvm::GCNSubtarget::MaxPrivateElementSize
unsigned MaxPrivateElementSize
Definition: GCNSubtarget.h:66

llvm::GCNSubtarget::getMinNumSGPRs
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
Definition: GCNSubtarget.h:1341

llvm::GCNSubtarget::ParseSubtargetFeatures
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)

llvm::GCNSubtarget::hasGFX90AInsts
bool hasGFX90AInsts() const
Definition: GCNSubtarget.h:1173

llvm::GCNSubtarget::computeOccupancy
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
Definition: AMDGPUSubtarget.cpp:726

llvm::GCNSubtarget::getBaseMaxNumVGPRs
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU) const
Definition: AMDGPUSubtarget.cpp:823

llvm::GCNSubtarget::HasVGPRIndexMode
bool HasVGPRIndexMode
Definition: GCNSubtarget.h:122

llvm::GCNSubtarget::getConstantBusLimit
unsigned getConstantBusLimit(unsigned Opcode) const
Definition: AMDGPUSubtarget.cpp:196

llvm::GCNSubtarget::getInstrItineraryData
const InstrItineraryData * getInstrItineraryData() const override
Definition: GCNSubtarget.h:297

llvm::GCNSubtarget::adjustSchedDependency
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
Definition: AMDGPUSubtarget.cpp:863

llvm::GCNSubtarget::hasMadF16
bool hasMadF16() const
Definition: AMDGPUSubtarget.cpp:656

llvm::GCNSubtarget::hasSGPRInitBug
bool hasSGPRInitBug() const
Definition: GCNSubtarget.h:1071

llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:264

llvm::GCNSubtarget::getMaxNumVGPRs
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
Definition: GCNSubtarget.h:1423

llvm::GCNSubtarget::LDSBankCount
int LDSBankCount
Definition: GCNSubtarget.h:65

llvm::GCNSubtarget::getMinNumVGPRs
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
Definition: GCNSubtarget.h:1417

llvm::GCNSubtarget::zeroesHigh16BitsOfDest
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
Definition: AMDGPUSubtarget.cpp:224

llvm::GCNSubtarget::getBaseMaxNumSGPRs
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
Definition: AMDGPUSubtarget.cpp:739

llvm::GCNSubtarget::initializeSubtargetDependencies
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
Definition: AMDGPUSubtarget.cpp:65

llvm::GCNSubtarget::Gen
unsigned Gen
Definition: GCNSubtarget.h:63

llvm::GCNSubtarget::HasMovrel
bool HasMovrel
Definition: GCNSubtarget.h:121

llvm::GCNSubtarget::getTargetLowering
const SITargetLowering * getTargetLowering() const override
Definition: GCNSubtarget.h:260

llvm::GCNSubtarget::getNSAThreshold
unsigned getNSAThreshold(const MachineFunction &MF) const
Definition: AMDGPUSubtarget.cpp:1019

llvm::GCNSubtarget::hasFlatAddressSpace
bool hasFlatAddressSpace() const
Definition: GCNSubtarget.h:605

llvm::GCNSubtarget::getReservedNumSGPRs
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
Definition: AMDGPUSubtarget.cpp:713

llvm::GCNSubtarget::hasMovrel
bool hasMovrel() const
Definition: GCNSubtarget.h:939

llvm::GCNSubtarget::useAA
bool useAA() const override
Definition: AMDGPUSubtarget.cpp:664

llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1462

llvm::GCNSubtarget::hasVGPRIndexMode
bool hasVGPRIndexMode() const
Definition: GCNSubtarget.h:943

llvm::GCNSubtarget::getOccupancyWithNumVGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
Definition: AMDGPUSubtarget.cpp:692

llvm::GCNSubtarget::HasArchitectedFlatScratch
bool HasArchitectedFlatScratch
Definition: GCNSubtarget.h:196

llvm::GCNSubtarget::createFillMFMAShadowMutation
std::unique_ptr< ScheduleDAGMutation > createFillMFMAShadowMutation(const TargetInstrInfo *TII) const
Definition: AMDGPUSubtarget.cpp:1014

llvm::GCNSubtarget::getOccupancyWithNumSGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
Definition: AMDGPUSubtarget.cpp:666

llvm::GCNSubtarget::getMaxWavesPerEU
unsigned getMaxWavesPerEU() const
Definition: AMDGPUSubtarget.h:285

llvm::GCNSubtarget::getGeneration
Generation getGeneration() const
Definition: GCNSubtarget.h:303

llvm::GCNSubtarget::GCNSubtarget
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
Definition: AMDGPUSubtarget.cpp:174

llvm::GCNSubtarget::getMaxNumSGPRs
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
Definition: GCNSubtarget.h:1347

llvm::GCNSubtarget::isXNACKEnabled
bool isXNACKEnabled() const
Definition: GCNSubtarget.h:591

llvm::GCNSubtarget::FlatForGlobal
bool FlatForGlobal
Definition: GCNSubtarget.h:74

llvm::GCNSubtarget::getBaseReservedNumSGPRs
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
Definition: AMDGPUSubtarget.cpp:697

llvm::GCNSubtarget::enableSIScheduler
bool enableSIScheduler() const
Definition: GCNSubtarget.h:1063

llvm::GCNSubtarget::hasAddr64
bool hasAddr64() const
Definition: GCNSubtarget.h:367

llvm::GCNSubtarget::hasFP64
bool hasFP64() const
Definition: GCNSubtarget.h:347

llvm::GCNSubtarget::overrideSchedPolicy
void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override
Definition: AMDGPUSubtarget.cpp:628

llvm::GCNSubtarget::getPostRAMutations
void getPostRAMutations(std::vector< std::unique_ptr< ScheduleDAGMutation > > &Mutations) const override
Definition: AMDGPUSubtarget.cpp:1008

llvm::GCNSubtarget::~GCNSubtarget
~GCNSubtarget() override

llvm::GCNSubtarget::TargetID
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
Definition: GCNSubtarget.h:62

llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:76

llvm::GCNUserSGPRUsageInfo
Definition: GCNSubtarget.h:1524

llvm::GCNUserSGPRUsageInfo::getNumUserSGPRForField
static unsigned getNumUserSGPRForField(UserSGPRID ID)
Definition: GCNSubtarget.h:1560

llvm::GCNUserSGPRUsageInfo::hasQueuePtr
bool hasQueuePtr() const
Definition: GCNSubtarget.h:1532

llvm::GCNUserSGPRUsageInfo::hasKernargSegmentPtr
bool hasKernargSegmentPtr() const
Definition: GCNSubtarget.h:1534

llvm::GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
Definition: AMDGPUSubtarget.cpp:1114

llvm::GCNUserSGPRUsageInfo::hasDispatchID
bool hasDispatchID() const
Definition: GCNSubtarget.h:1536

llvm::GCNUserSGPRUsageInfo::ImplicitBufferPtrID
@ ImplicitBufferPtrID
Definition: GCNSubtarget.h:1549

llvm::GCNUserSGPRUsageInfo::DispatchIdID
@ DispatchIdID
Definition: GCNSubtarget.h:1554

llvm::GCNUserSGPRUsageInfo::QueuePtrID
@ QueuePtrID
Definition: GCNSubtarget.h:1552

llvm::GCNUserSGPRUsageInfo::DispatchPtrID
@ DispatchPtrID
Definition: GCNSubtarget.h:1551

llvm::GCNUserSGPRUsageInfo::FlatScratchInitID
@ FlatScratchInitID
Definition: GCNSubtarget.h:1555

llvm::GCNUserSGPRUsageInfo::PrivateSegmentBufferID
@ PrivateSegmentBufferID
Definition: GCNSubtarget.h:1550

llvm::GCNUserSGPRUsageInfo::KernargSegmentPtrID
@ KernargSegmentPtrID
Definition: GCNSubtarget.h:1553

llvm::GCNUserSGPRUsageInfo::hasPrivateSegmentBuffer
bool hasPrivateSegmentBuffer() const
Definition: GCNSubtarget.h:1528

llvm::GCNUserSGPRUsageInfo::getNumFreeUserSGPRs
unsigned getNumFreeUserSGPRs()
Definition: AMDGPUSubtarget.cpp:1120

llvm::GCNUserSGPRUsageInfo::hasImplicitBufferPtr
bool hasImplicitBufferPtr() const
Definition: GCNSubtarget.h:1526

llvm::GCNUserSGPRUsageInfo::hasDispatchPtr
bool hasDispatchPtr() const
Definition: GCNSubtarget.h:1530

llvm::GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
Definition: AMDGPUSubtarget.cpp:1048

llvm::GCNUserSGPRUsageInfo::hasFlatScratchInit
bool hasFlatScratchInit() const
Definition: GCNSubtarget.h:1538

llvm::GlobalObject::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
Definition: Value.h:565

llvm::InlineAsmLowering
Definition: InlineAsmLowering.h:28

llvm::Instruction
Definition: Instruction.h:49

llvm::Legalizer
Definition: Legalizer.h:37

llvm::MDBuilder
Definition: MDBuilder.h:36

llvm::MDBuilder::createRange
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:94

llvm::MDNode
Metadata node.
Definition: Metadata.h:1067

llvm::MachineBasicBlock::instr_end
instr_iterator instr_end()
Definition: MachineBasicBlock.h:315

llvm::MachineBasicBlock::const_instr_iterator
Instructions::const_iterator const_instr_iterator
Definition: MachineBasicBlock.h:289

llvm::MachineFunction
Definition: MachineFunction.h:259

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:718

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:684

llvm::MachineFunction::getTarget
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition: MachineFunction.h:714

llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition: MachineFunction.h:816

llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:69

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546

llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329

llvm::MachineInstr::isBundle
bool isBundle() const
Definition: MachineInstr.h:1390

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65

llvm::R600Subtarget
Definition: R600Subtarget.h:30

llvm::SDep
Scheduling dependency.
Definition: ScheduleDAG.h:49

llvm::SDep::getKind
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486

llvm::SDep::Data
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53

llvm::SDep::setLatency
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147

llvm::SDep::Artificial
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72

llvm::SDep::getLatency
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142

llvm::SDep::getReg
unsigned getReg() const
Returns the register associated with this edge.
Definition: ScheduleDAG.h:218

llvm::SIInstrInfo
Definition: SIInstrInfo.h:83

llvm::SIInstrInfo::getSchedModel
const TargetSchedModel & getSchedModel() const
Definition: SIInstrInfo.h:1374

llvm::SIInstrInfo::getInstrLatency
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
Definition: SIInstrInfo.cpp:9479

llvm::SIInstrInfo::fixImplicitOperands
void fixImplicitOperands(MachineInstr &MI) const
Definition: SIInstrInfo.cpp:8960

llvm::SIInstrInfo::pseudoToMCOpcode
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
Definition: SIInstrInfo.cpp:9172

llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition: SIMachineFunctionInfo.h:376

llvm::SIMachineFunctionInfo::getNumPreloadedSGPRs
unsigned getNumPreloadedSGPRs() const
Definition: SIMachineFunctionInfo.h:875

llvm::SIMachineFunctionInfo::getWavesPerEU
std::pair< unsigned, unsigned > getWavesPerEU() const
Definition: SIMachineFunctionInfo.h:1026

llvm::SIMachineFunctionInfo::getUserSGPRInfo
GCNUserSGPRUsageInfo & getUserSGPRInfo()
Definition: SIMachineFunctionInfo.h:603

llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:32

llvm::SUnit
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242

llvm::SUnit::Succs
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257

llvm::SUnit::getInstr
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373

llvm::ScheduleDAGInstrs
A ScheduleDAG for scheduling lists of MachineInstr.
Definition: ScheduleDAGInstrs.h:121

llvm::ScheduleDAGInstrs::getSchedModel
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
Definition: ScheduleDAGInstrs.h:276

llvm::ScheduleDAGInstrs::addEdge
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
Definition: ScheduleDAGInstrs.cpp:1222

llvm::ScheduleDAGInstrs::dumpNode
void dumpNode(const SUnit &SU) const override
Definition: ScheduleDAGInstrs.cpp:1178

llvm::ScheduleDAGInstrs::canAddEdge
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU)
True if an edge can be added from PredSU to SuccSU without creating a cycle.
Definition: ScheduleDAGInstrs.cpp:1218

llvm::ScheduleDAGMI
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Definition: MachineScheduler.h:276

llvm::ScheduleDAGMutation
Mutate the DAG as a postpass after normal DAG building.
Definition: ScheduleDAGMutation.h:22

llvm::ScheduleDAG::SUnits
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561

llvm::ScheduleDAG::MF
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559

llvm::ScheduleDAG::ExitSU
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:563

llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321

llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:427

llvm::SmallString
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50

llvm::TargetFrameLowering
Information about stack frame layout on the target.
Definition: TargetFrameLowering.h:44

llvm::TargetInstrInfo
TargetInstrInfo - Interface to description of machine instruction set.
Definition: TargetInstrInfo.h:110

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76

llvm::TargetMachine::getTargetTriple
const Triple & getTargetTriple() const
Definition: TargetMachine.h:125

llvm::TargetSchedModel
Provide an instruction scheduling machine model to CodeGen passes.
Definition: TargetSchedule.h:30

llvm::TargetSchedModel::computeOperandLatency
unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
Definition: TargetSchedule.cpp:173

llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44

llvm::Triple::AMDHSA
@ AMDHSA
Definition: Triple.h:222

llvm::Triple::amdgcn
@ amdgcn
Definition: Triple.h:74

llvm::Triple::getArch
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:372

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43

llvm::Value
LLVM Value Representation.
Definition: Value.h:74

llvm::cl::opt
Definition: CommandLine.h:1430

llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:109

uint32_t

uint64_t

unsigned

llvm::AMDGPU::IsaInfo::getMaxWavesPerEU
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
Definition: AMDGPUBaseInfo.cpp:944

llvm::AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG
@ FIXED_NUM_SGPRS_FOR_INIT_BUG
Definition: AMDGPUBaseInfo.h:109

llvm::AMDGPU::IsaInfo::getEUsPerCU
unsigned getEUsPerCU(const MCSubtargetInfo *STI)
Definition: AMDGPUBaseInfo.cpp:910

llvm::AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned NumVGPRs)
Definition: AMDGPUBaseInfo.cpp:1130

llvm::AMDGPU::getIntegerVecAttribute
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size)
Definition: AMDGPUBaseInfo.cpp:1267

llvm::AMDGPU::getMaxNumUserSGPRs
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
Definition: AMDGPUBaseInfo.cpp:2151

llvm::AMDGPU::isEntryFunctionCC
bool isEntryFunctionCC(CallingConv::ID CC)
Definition: AMDGPUBaseInfo.cpp:2071

llvm::AMDGPU::isKernel
LLVM_READNONE bool isKernel(CallingConv::ID CC)
Definition: AMDGPUBaseInfo.h:1250

llvm::AMDGPU::getAMDHSACodeObjectVersion
unsigned getAMDHSACodeObjectVersion(const Module &M)
Definition: AMDGPUBaseInfo.cpp:166

llvm::AMDGPU::AMDHSA_COV5
@ AMDHSA_COV5
Definition: AMDGPUBaseInfo.h:52

llvm::AMDGPU::isShader
bool isShader(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:2046

llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition: AMDGPUBaseInfo.cpp:2197

llvm::AMDGPU::getIntegerPairAttribute
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
Definition: AMDGPUBaseInfo.cpp:1243

llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:2063

llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73

llvm::CallingConv::AMDGPU_VS
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188

llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200

llvm::CallingConv::AMDGPU_HS
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206

llvm::CallingConv::AMDGPU_GS
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191

llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194

llvm::CallingConv::SPIR_KERNEL
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144

llvm::CallingConv::AMDGPU_ES
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218

llvm::CallingConv::AMDGPU_LS
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213

llvm::SIEncodingFamily::SI
@ SI
Definition: SIDefines.h:36

llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:138

llvm::cl::apply
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition: CommandLine.h:1316

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::divideCeil
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428

llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155

llvm::InstructionUniformity::Default
@ Default
The result values are uniform if and only if all operands are uniform.

llvm::AMDGPUDwarfFlavour
AMDGPUDwarfFlavour
Definition: AMDGPUMCTargetDesc.h:31

llvm::Wave32
@ Wave32
Definition: AMDGPUMCTargetDesc.h:31

llvm::Wave64
@ Wave64
Definition: AMDGPUMCTargetDesc.h:31

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39

llvm::MachineSchedPolicy
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
Definition: MachineScheduler.h:184

llvm::MachineSchedPolicy::OnlyTopDown
bool OnlyTopDown
Definition: MachineScheduler.h:193

llvm::MachineSchedPolicy::OnlyBottomUp
bool OnlyBottomUp
Definition: MachineScheduler.h:194

llvm::MachineSchedPolicy::ShouldTrackPressure
bool ShouldTrackPressure
Definition: MachineScheduler.h:186

llvm::MachineSchedPolicy::ShouldTrackLaneMasks
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg.
Definition: MachineScheduler.h:189

llvm::cl::desc
Definition: CommandLine.h:416