LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUSubtarget.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 191 196 97.4 %
Date: 2018-02-20 16:54:40 Functions: 21 22 95.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "AMDGPUSubtarget.h"
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUTargetMachine.h"
      18             : #include "AMDGPUCallLowering.h"
      19             : #include "AMDGPUInstructionSelector.h"
      20             : #include "AMDGPULegalizerInfo.h"
      21             : #include "AMDGPURegisterBankInfo.h"
      22             : #include "SIMachineFunctionInfo.h"
      23             : #include "llvm/ADT/SmallString.h"
      24             : #include "llvm/CodeGen/MachineScheduler.h"
      25             : #include "llvm/IR/MDBuilder.h"
      26             : #include "llvm/CodeGen/TargetFrameLowering.h"
      27             : #include <algorithm>
      28             : 
      29             : using namespace llvm;
      30             : 
      31             : #define DEBUG_TYPE "amdgpu-subtarget"
      32             : 
      33             : #define GET_SUBTARGETINFO_TARGET_DESC
      34             : #define GET_SUBTARGETINFO_CTOR
      35             : #include "AMDGPUGenSubtargetInfo.inc"
      36             : 
// Out-of-line defaulted destructor; commonly done so the header need not see
// the full definitions of member types — TODO confirm against the header.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
      38             : 
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // The feature string is assembled with the defaults first and the user
  // string (FS) appended last, so user-specified features override them.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  // User features take precedence since they are appended last.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed (feature parsing may have left these at 0).
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
      97             : 
/// Construct a subtarget for the given triple/CPU/feature string.
///
/// Every feature flag is explicitly value-initialized to its "off"/zero
/// default here; the real configuration happens at the end of the body via
/// initializeSubtargetDependencies(), which parses the (augmented) feature
/// string and fills in target-specific defaults.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SOUTHERN_ISLANDS at minimum); anything else
    // is treated as R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
     180             : 
     181       16414 : unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
     182             :   const Function &F) const {
     183       16414 :   if (NWaves == 1)
     184          26 :     return getLocalMemorySize();
     185       16388 :   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
     186             :   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
     187             :   unsigned MaxWaves = getMaxWavesPerEU();
     188       16388 :   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
     189             : }
     190             : 
     191      138404 : unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
     192             :   const Function &F) const {
     193      138404 :   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
     194             :   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
     195      138404 :   unsigned MaxWaves = getMaxWavesPerEU();
     196      138404 :   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
     197      138404 :   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
     198      138404 :   NumWaves = std::min(NumWaves, MaxWaves);
     199      276808 :   NumWaves = std::max(NumWaves, 1u);
     200      138404 :   return NumWaves;
     201             : }
     202             : 
     203             : std::pair<unsigned, unsigned>
     204      209608 : AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
     205             :   switch (CC) {
     206      184298 :   case CallingConv::AMDGPU_CS:
     207             :   case CallingConv::AMDGPU_KERNEL:
     208             :   case CallingConv::SPIR_KERNEL:
     209      184298 :     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
     210       10429 :   case CallingConv::AMDGPU_VS:
     211             :   case CallingConv::AMDGPU_LS:
     212             :   case CallingConv::AMDGPU_HS:
     213             :   case CallingConv::AMDGPU_ES:
     214             :   case CallingConv::AMDGPU_GS:
     215             :   case CallingConv::AMDGPU_PS:
     216       10429 :     return std::make_pair(1, getWavefrontSize());
     217       14881 :   default:
     218       14881 :     return std::make_pair(1, 16 * getWavefrontSize());
     219             :   }
     220             : }
     221             : 
     222      209608 : std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
     223             :   const Function &F) const {
     224             :   // FIXME: 1024 if function.
     225             :   // Default minimum/maximum flat work group sizes.
     226             :   std::pair<unsigned, unsigned> Default =
     227      209608 :     getDefaultFlatWorkGroupSize(F.getCallingConv());
     228             : 
     229             :   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
     230             :   // starts using "amdgpu-flat-work-group-size" attribute.
     231      419216 :   Default.second = AMDGPU::getIntegerAttribute(
     232      209608 :     F, "amdgpu-max-work-group-size", Default.second);
     233      209608 :   Default.first = std::min(Default.first, Default.second);
     234             : 
     235             :   // Requested minimum/maximum flat work group sizes.
     236      419216 :   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
     237      209608 :     F, "amdgpu-flat-work-group-size", Default);
     238             : 
     239             :   // Make sure requested minimum is less than requested maximum.
     240      209608 :   if (Requested.first > Requested.second)
     241           0 :     return Default;
     242             : 
     243             :   // Make sure requested values do not violate subtarget's specifications.
     244      209608 :   if (Requested.first < getMinFlatWorkGroupSize())
     245          93 :     return Default;
     246      209515 :   if (Requested.second > getMaxFlatWorkGroupSize())
     247           0 :     return Default;
     248             : 
     249      209515 :   return Requested;
     250             : }
     251             : 
     252       33479 : std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
     253             :   const Function &F) const {
     254             :   // Default minimum/maximum number of waves per execution unit.
     255             :   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
     256             : 
     257             :   // Default/requested minimum/maximum flat work group sizes.
     258       33479 :   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
     259             : 
     260             :   // If minimum/maximum flat work group sizes were explicitly requested using
     261             :   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
     262             :   // number of waves per execution unit to values implied by requested
     263             :   // minimum/maximum flat work group sizes.
     264             :   unsigned MinImpliedByFlatWorkGroupSize =
     265       33479 :     getMaxWavesPerEU(FlatWorkGroupSizes.second);
     266             :   bool RequestedFlatWorkGroupSize = false;
     267             : 
     268             :   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
     269             :   // starts using "amdgpu-flat-work-group-size" attribute.
     270       66944 :   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
     271             :       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
     272             :     Default.first = MinImpliedByFlatWorkGroupSize;
     273             :     RequestedFlatWorkGroupSize = true;
     274             :   }
     275             : 
     276             :   // Requested minimum/maximum number of waves per execution unit.
     277       66958 :   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
     278       33479 :     F, "amdgpu-waves-per-eu", Default, true);
     279             : 
     280             :   // Make sure requested minimum is less than requested maximum.
     281       33479 :   if (Requested.second && Requested.first > Requested.second)
     282           0 :     return Default;
     283             : 
     284             :   // Make sure requested values do not violate subtarget's specifications.
     285       66958 :   if (Requested.first < getMinWavesPerEU() ||
     286             :       Requested.first > getMaxWavesPerEU())
     287           0 :     return Default;
     288       33479 :   if (Requested.second > getMaxWavesPerEU())
     289           0 :     return Default;
     290             : 
     291             :   // Make sure requested values are compatible with values implied by requested
     292             :   // minimum/maximum flat work group sizes.
     293       33479 :   if (RequestedFlatWorkGroupSize &&
     294             :       Requested.first < MinImpliedByFlatWorkGroupSize)
     295           2 :     return Default;
     296             : 
     297       33477 :   return Requested;
     298             : }
     299             : 
/// Attach !range metadata to \p I bounding the values a workitem-id or
/// local-size query can produce, based on the enclosing function's maximum
/// flat work group size (narrowed by reqd_work_group_size metadata when
/// present). \return true if range metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // Map the intrinsic to the dimension it queries. Workitem-id queries
      // fall through to also set Dim; they are flagged so the final range is
      // [0, max) instead of [min, max + 1).
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        // Unrelated call: Dim stays UINT_MAX and no narrowing happens below.
        break;
      }
      if (Dim <= 3) {
        // reqd_work_group_size pins the exact size for this dimension.
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable upper bound; do not attach metadata.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
     361             : 
/// Construct the R600 (pre-GCN) subtarget. The frame lowering is configured
/// with an upward-growing stack; the trailing 0 is the local-area offset
/// passed to TargetFrameLowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
     368             : 
/// Construct the GCN (SI+) subtarget and set up its GlobalISel support
/// objects: call lowering, legalizer, register bank info, and instruction
/// selector.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  // The instruction selector needs the concrete AMDGPURegisterBankInfo, so
  // downcast the stored base pointer here, where the dynamic type is known.
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}
     381             : 
     382       39407 : void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
     383             :                                       unsigned NumRegionInstrs) const {
     384             :   // Track register pressure so the scheduler can try to decrease
     385             :   // pressure once register usage is above the threshold defined by
     386             :   // SIRegisterInfo::getRegPressureSetLimit()
     387       39407 :   Policy.ShouldTrackPressure = true;
     388             : 
     389             :   // Enabling both top down and bottom up scheduling seems to give us less
     390             :   // register spills than just using one of these approaches on its own.
     391       39407 :   Policy.OnlyTopDown = false;
     392       39407 :   Policy.OnlyBottomUp = false;
     393             : 
     394             :   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
     395       39407 :   if (!enableSIScheduler())
     396       39404 :     Policy.ShouldTrackLaneMasks = true;
     397       39407 : }
     398             : 
     399       20093 : bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
     400       39794 :   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
     401             : }
     402             : 
     403        3660 : unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
     404             :                                             unsigned ExplicitArgBytes) const {
     405        3660 :   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
     406        3660 :   if (ImplicitBytes == 0)
     407             :     return ExplicitArgBytes;
     408             : 
     409             :   unsigned Alignment = getAlignmentForImplicitArgPtr();
     410         228 :   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
     411             : }
     412             : 
     413        1035 : unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
     414        1035 :   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     415         531 :     if (SGPRs <= 80)
     416             :       return 10;
     417          53 :     if (SGPRs <= 88)
     418             :       return 9;
     419          51 :     if (SGPRs <= 100)
     420             :       return 8;
     421          48 :     return 7;
     422             :   }
     423         504 :   if (SGPRs <= 48)
     424             :     return 10;
     425         118 :   if (SGPRs <= 56)
     426             :     return 9;
     427         107 :   if (SGPRs <= 64)
     428             :     return 8;
     429          92 :   if (SGPRs <= 72)
     430             :     return 7;
     431          61 :   if (SGPRs <= 80)
     432             :     return 6;
     433          61 :   return 5;
     434             : }
     435             : 
     436        1035 : unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
     437        1035 :   if (VGPRs <= 24)
     438             :     return 10;
     439         903 :   if (VGPRs <= 28)
     440             :     return 9;
     441         867 :   if (VGPRs <= 32)
     442             :     return 8;
     443         643 :   if (VGPRs <= 36)
     444             :     return 7;
     445         338 :   if (VGPRs <= 40)
     446             :     return 6;
     447         307 :   if (VGPRs <= 48)
     448             :     return 5;
     449         283 :   if (VGPRs <= 64)
     450             :     return 4;
     451         260 :   if (VGPRs <= 84)
     452             :     return 3;
     453         163 :   if (VGPRs <= 128)
     454             :     return 2;
     455         137 :   return 1;
     456             : }
     457             : 
     458      118639 : unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
     459             :   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
     460      118639 :   if (MFI.hasFlatScratchInit()) {
     461        2993 :     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     462             :       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
     463        1271 :     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
     464             :       return 4; // FLAT_SCRATCH, VCC (in that order).
     465             :   }
     466             : 
     467      115968 :   if (isXNACKEnabled())
     468             :     return 4; // XNACK, VCC (in that order).
     469      115247 :   return 2; // VCC.
     470             : }
     471             : 
/// Compute the maximum number of SGPRs available to \p MF: the limit implied
/// by the requested waves-per-EU, optionally overridden by the
/// "amdgpu-num-sgpr" attribute, minus the SGPRs reserved for special
/// registers (see getReservedNumSGPRs), capped at the addressable maximum.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute. Throughout this block, Requested == 0 means
  // "no (valid) request"; each check below may reset it to 0.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications:
    // it must exceed the count reserved for special registers.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Targets with the SGPR init bug must use a fixed count regardless of what
  // was computed above.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
     521             : 
     522       85997 : unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
     523       85997 :   const Function &F = MF.getFunction();
     524             :   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
     525             : 
     526             :   // Compute maximum number of VGPRs function can use using default/requested
     527             :   // minimum number of waves per execution unit.
     528             :   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
     529             :   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
     530             : 
     531             :   // Check if maximum number of VGPRs was explicitly requested using
     532             :   // "amdgpu-num-vgpr" attribute.
     533       85997 :   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
     534          72 :     unsigned Requested = AMDGPU::getIntegerAttribute(
     535          24 :       F, "amdgpu-num-vgpr", MaxNumVGPRs);
     536             : 
     537             :     // Make sure requested value does not violate subtarget's specifications.
     538          48 :     if (Requested && Requested <= getReservedNumVGPRs(MF))
     539             :       Requested = 0;
     540             : 
     541             :     // Make sure requested value is compatible with values implied by
     542             :     // default/requested minimum/maximum number of waves per execution unit.
     543          48 :     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
     544             :       Requested = 0;
     545          24 :     if (WavesPerEU.second &&
     546          48 :         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
     547             :       Requested = 0;
     548             : 
     549          24 :     if (Requested)
     550             :       MaxNumVGPRs = Requested;
     551             :   }
     552             : 
     553       85997 :   return MaxNumVGPRs - getReservedNumVGPRs(MF);
     554             : }
     555             : 
namespace {
/// Post-RA DAG mutation that links runs of consecutive memory operations of
/// the same kind so the scheduler cannot move them apart.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    // SUa tracks the previous memory operation in program order, or null if
    // the chain was broken by a non-memory instruction.
    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster memory ops of the same class (VMEM/FLAT/SMRD/DS).
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Give SUa artificial edges from all of SU's other predecessors, so
        // no predecessor of SU can be scheduled between SUa and SU.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Symmetrically, make SUa's other successors depend on SU so none of
        // them can be scheduled between the pair.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
     606             : 
     607       12993 : void SISubtarget::getPostRAMutations(
     608             :     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
     609       25986 :   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
     610       12993 : }

Generated by: LCOV version 1.13