//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
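// The TableGen-generated .inc files #undef these guard macros after
// expanding the guarded sections, so they must be redefined before the R600
// variant is included below.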
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "R600GenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= R600Subtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

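// Build the effective feature string for GCN targets. Defaults are
// prepended and the user-provided FS is appended last, so explicit user
// features win during parsing. For example (illustrative), on amdhsa with
// FS = "-promote-alloca", the parsed string begins with "+promote-alloca"
// and ends with "-promote-alloca", leaving promote-alloca disabled.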
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUCommonSubtarget::AMDGPUCommonSubtarget(const Triple &TT,
                                             const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUCommonSubtarget(TT, getFeatureBits()),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUCommonSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

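// Estimate the number of waves that can run concurrently given LDS usage:
// at full occupancy each workgroup may use at most LDS / WorkGroupsPerCu
// bytes, so a workgroup using Bytes of LDS supports proportionally fewer
// waves. For example (illustrative numbers), with 64 KiB of LDS, 10 waves
// per EU, and 4 workgroups per CU, a workgroup using 32 KiB of LDS is
// limited to (65536 * 10 / 4) / 32768 = 5 waves.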
unsigned AMDGPUCommonSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUCommonSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

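// Default flat workgroup size bounds depend on the calling convention. With
// the usual 64-lane wavefront, compute kernels default to 128..256 lanes,
// graphics shaders to at most a single wavefront, and other functions to at
// most 16 wavefronts.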
std::pair<unsigned, unsigned>
AMDGPUCommonSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUCommonSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUCommonSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

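// makeLIDRangeMetadata attaches !range metadata to a workitem-id or
// local-size query so later passes know its bounds. For example
// (illustrative), in a kernel whose maximum flat workgroup size is 256, a
// workitem.id.x call is annotated with !range !{i32 0, i32 256}; the
// interval is half-open, so the IDs are 0..255.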
bool AMDGPUCommonSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID queries we need to pass the max size
  // as Hi; for size queries we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUCommonSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS(AMDGPU::getAMDGPUAS(TT)) { }

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

uint64_t SISubtarget::getExplicitKernArgSize(const Function &F) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
  }

  return ExplicitArgBytes;
}

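// Note that when implicit (runtime-provided) arguments are present, the
// explicit kernarg offset is not added; instead the explicit argument bytes
// are padded to the implicit-arg pointer alignment and the implicit bytes
// appended. The final size is rounded up to 4 bytes.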
unsigned SISubtarget::getKernArgSegmentSize(const Function &F,
                                            int64_t ExplicitArgBytes) const {
  if (ExplicitArgBytes == -1)
    ExplicitArgBytes = getExplicitKernArgSize(F);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

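// Map SGPR/VGPR usage to the maximum number of waves per execution unit.
// The thresholds reflect the physical register files (e.g. 256 VGPRs per
// SIMD on GCN) and their allocation granularity; for example, a kernel
// using 33..36 VGPRs can run at most 7 waves.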
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart. During DAG pre-processing,
    // SUnits are still in the original order of the instructions before
    // scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

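// Dispatch to the right subtarget class for the target triple: amdgcn maps
// to AMDGPUSubtarget (GCN), r600 to R600Subtarget.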
const AMDGPUCommonSubtarget &AMDGPUCommonSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUCommonSubtarget&>(MF.getSubtarget<AMDGPUSubtarget>());
  return static_cast<const AMDGPUCommonSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUCommonSubtarget &AMDGPUCommonSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUCommonSubtarget&>(TM.getSubtarget<AMDGPUSubtarget>(F));
  return static_cast<const AMDGPUCommonSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}
