LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUSubtarget.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 191 196 97.4 %
Date: 2018-05-20 00:06:23 Functions: 21 22 95.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Implements the AMDGPU specific subclass of TargetSubtarget.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "AMDGPUSubtarget.h"
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUTargetMachine.h"
      18             : #include "AMDGPUCallLowering.h"
      19             : #include "AMDGPUInstructionSelector.h"
      20             : #include "AMDGPULegalizerInfo.h"
      21             : #include "AMDGPURegisterBankInfo.h"
      22             : #include "SIMachineFunctionInfo.h"
      23             : #include "llvm/ADT/SmallString.h"
      24             : #include "llvm/CodeGen/MachineScheduler.h"
      25             : #include "llvm/IR/MDBuilder.h"
      26             : #include "llvm/CodeGen/TargetFrameLowering.h"
      27             : #include <algorithm>
      28             : 
      29             : using namespace llvm;
      30             : 
      31             : #define DEBUG_TYPE "amdgpu-subtarget"
      32             : 
      33             : #define GET_SUBTARGETINFO_TARGET_DESC
      34             : #define GET_SUBTARGETINFO_CTOR
      35             : #include "AMDGPUGenSubtargetInfo.inc"
      36             : 
// Defaulted destructor, defined out of line in this TU.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
      38             : 
/// Resolve the final feature settings for this subtarget: seed the feature
/// string with target defaults, append the user-provided features (which take
/// precedence because they come last), parse the result, then patch up fields
/// that still need sensible defaults. Returns *this so the constructor can
/// chain off the call.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  // User-specified features are appended last so they override the defaults.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
      97             : 
/// Construct the subtarget. Every feature field starts at its conservative
/// "off"/zero default here; the real values are filled in by
/// initializeSubtargetDependencies() after the feature string is parsed.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are the GCN (SI+) line; everything else is legacy R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
     185             : 
     186       16924 : unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
     187             :   const Function &F) const {
     188       16924 :   if (NWaves == 1)
     189          26 :     return getLocalMemorySize();
     190       16898 :   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
     191             :   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
     192             :   unsigned MaxWaves = getMaxWavesPerEU();
     193       16898 :   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
     194             : }
     195             : 
     196      145433 : unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
     197             :   const Function &F) const {
     198      145433 :   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
     199             :   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
     200      145433 :   unsigned MaxWaves = getMaxWavesPerEU();
     201      145433 :   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
     202      145433 :   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
     203      145433 :   NumWaves = std::min(NumWaves, MaxWaves);
     204      290866 :   NumWaves = std::max(NumWaves, 1u);
     205      145433 :   return NumWaves;
     206             : }
     207             : 
     208             : std::pair<unsigned, unsigned>
     209      219962 : AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
     210             :   switch (CC) {
     211      189311 :   case CallingConv::AMDGPU_CS:
     212             :   case CallingConv::AMDGPU_KERNEL:
     213             :   case CallingConv::SPIR_KERNEL:
     214      189311 :     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
     215       13386 :   case CallingConv::AMDGPU_VS:
     216             :   case CallingConv::AMDGPU_LS:
     217             :   case CallingConv::AMDGPU_HS:
     218             :   case CallingConv::AMDGPU_ES:
     219             :   case CallingConv::AMDGPU_GS:
     220             :   case CallingConv::AMDGPU_PS:
     221       13386 :     return std::make_pair(1, getWavefrontSize());
     222       17265 :   default:
     223       17265 :     return std::make_pair(1, 16 * getWavefrontSize());
     224             :   }
     225             : }
     226             : 
     227      219962 : std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
     228             :   const Function &F) const {
     229             :   // FIXME: 1024 if function.
     230             :   // Default minimum/maximum flat work group sizes.
     231             :   std::pair<unsigned, unsigned> Default =
     232      219962 :     getDefaultFlatWorkGroupSize(F.getCallingConv());
     233             : 
     234             :   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
     235             :   // starts using "amdgpu-flat-work-group-size" attribute.
     236      439924 :   Default.second = AMDGPU::getIntegerAttribute(
     237      219962 :     F, "amdgpu-max-work-group-size", Default.second);
     238      219962 :   Default.first = std::min(Default.first, Default.second);
     239             : 
     240             :   // Requested minimum/maximum flat work group sizes.
     241      439924 :   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
     242      219962 :     F, "amdgpu-flat-work-group-size", Default);
     243             : 
     244             :   // Make sure requested minimum is less than requested maximum.
     245      219962 :   if (Requested.first > Requested.second)
     246           0 :     return Default;
     247             : 
     248             :   // Make sure requested values do not violate subtarget's specifications.
     249      219962 :   if (Requested.first < getMinFlatWorkGroupSize())
     250          91 :     return Default;
     251      219871 :   if (Requested.second > getMaxFlatWorkGroupSize())
     252           0 :     return Default;
     253             : 
     254      219871 :   return Requested;
     255             : }
     256             : 
/// Compute the {minimum, maximum} number of waves per execution unit for
/// \p F, honoring the "amdgpu-waves-per-eu" attribute while keeping the
/// result consistent with the (possibly attribute-specified) flat work group
/// sizes. Any invalid request falls back to the default range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // NOTE(review): Requested.second == 0 appears to mean "no explicit
  // maximum" (the guard skips the check then) — confirm against
  // getIntegerPairAttribute's contract.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
     304             : 
/// Attach !range metadata to \p I bounding a work-item id / local size query
/// by the kernel's flat work group size (narrowed further by an explicit
/// reqd_work_group_size annotation, if present). Returns true if metadata
/// was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // The *_id_* intrinsics fall through to share the dimension assignment
      // with the corresponding *_size_* intrinsics; only the id variants set
      // IdQuery (they are bounded by size, size queries by size + 1).
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim stays UINT_MAX for unrelated calls, skipping the narrowing below.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable upper bound — don't emit a degenerate range.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
     366             : 
/// R600-family subtarget: wires up the R600 instruction, frame lowering and
/// target lowering objects. Note the stack grows upward on this target.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
     373             : 
     374        2205 : SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     375        2205 :                          const GCNTargetMachine &TM)
     376             :     : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
     377             :       FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
     378        4410 :       TLInfo(TM, *this) {
     379        2205 :   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
     380        2205 :   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
     381             : 
     382        2205 :   RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
     383             :   InstSelector.reset(new AMDGPUInstructionSelector(
     384        2205 :       *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
     385        2205 : }
     386             : 
     387       41390 : void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
     388             :                                       unsigned NumRegionInstrs) const {
     389             :   // Track register pressure so the scheduler can try to decrease
     390             :   // pressure once register usage is above the threshold defined by
     391             :   // SIRegisterInfo::getRegPressureSetLimit()
     392       41390 :   Policy.ShouldTrackPressure = true;
     393             : 
     394             :   // Enabling both top down and bottom up scheduling seems to give us less
     395             :   // register spills than just using one of these approaches on its own.
     396       41390 :   Policy.OnlyTopDown = false;
     397       41390 :   Policy.OnlyBottomUp = false;
     398             : 
     399             :   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
     400       41390 :   if (!enableSIScheduler())
     401       41387 :     Policy.ShouldTrackLaneMasks = true;
     402       41390 : }
     403             : 
     404       21416 : bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
     405       42440 :   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
     406             : }
     407             : 
     408        3715 : unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
     409             :                                             unsigned ExplicitArgBytes) const {
     410        3715 :   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
     411        3715 :   if (ImplicitBytes == 0)
     412             :     return ExplicitArgBytes;
     413             : 
     414             :   unsigned Alignment = getAlignmentForImplicitArgPtr();
     415         198 :   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
     416             : }
     417             : 
     418        1169 : unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
     419        1169 :   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
     420         627 :     if (SGPRs <= 80)
     421             :       return 10;
     422          55 :     if (SGPRs <= 88)
     423             :       return 9;
     424          53 :     if (SGPRs <= 100)
     425             :       return 8;
     426          48 :     return 7;
     427             :   }
     428         542 :   if (SGPRs <= 48)
     429             :     return 10;
     430         128 :   if (SGPRs <= 56)
     431             :     return 9;
     432         115 :   if (SGPRs <= 64)
     433             :     return 8;
     434         102 :   if (SGPRs <= 72)
     435             :     return 7;
     436          61 :   if (SGPRs <= 80)
     437             :     return 6;
     438          61 :   return 5;
     439             : }
     440             : 
     441        1169 : unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
     442        1169 :   if (VGPRs <= 24)
     443             :     return 10;
     444        1005 :   if (VGPRs <= 28)
     445             :     return 9;
     446         977 :   if (VGPRs <= 32)
     447             :     return 8;
     448         746 :   if (VGPRs <= 36)
     449             :     return 7;
     450         368 :   if (VGPRs <= 40)
     451             :     return 6;
     452         334 :   if (VGPRs <= 48)
     453             :     return 5;
     454         312 :   if (VGPRs <= 64)
     455             :     return 4;
     456         286 :   if (VGPRs <= 84)
     457             :     return 3;
     458         165 :   if (VGPRs <= 128)
     459             :     return 2;
     460         139 :   return 1;
     461             : }
     462             : 
     463      124743 : unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
     464             :   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
     465      124743 :   if (MFI.hasFlatScratchInit()) {
     466        2998 :     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     467             :       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
     468        1255 :     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
     469             :       return 4; // FLAT_SCRATCH, VCC (in that order).
     470             :   }
     471             : 
     472      122051 :   if (isXNACKEnabled())
     473             :     return 4; // XNACK, VCC (in that order).
     474      121167 :   return 2; // VCC.
     475             : }
     476             : 
/// Maximum number of allocatable SGPRs for \p MF: the waves-per-EU implied
/// limit, optionally overridden by a validated "amdgpu-num-sgpr" attribute,
/// minus the reserved special registers, and never above the addressable
/// maximum.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 below means "ignore the attribute".
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR init bug must use a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
     526             : 
/// Maximum number of allocatable VGPRs for \p MF: the waves-per-EU implied
/// limit, optionally overridden by a validated "amdgpu-num-vgpr" attribute,
/// minus the reserved VGPRs. Mirrors getMaxNumSGPRs() above.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 below means "ignore the attribute".
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
     553             : 
     554          24 :     if (Requested)
     555             :       MaxNumVGPRs = Requested;
     556             :   }
     557             : 
     558       90713 :   return MaxNumVGPRs - getReservedNumVGPRs(MF);
     559             : }
     560             : 
     561             : namespace {
     562       13826 : struct MemOpClusterMutation : ScheduleDAGMutation {
     563             :   const SIInstrInfo *TII;
     564             : 
     565       13826 :   MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
     566             : 
     567       31464 :   void apply(ScheduleDAGInstrs *DAGInstrs) override {
     568             :     ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
     569             : 
     570             :     SUnit *SUa = nullptr;
     571             :     // Search for two consequent memory operations and link them
     572             :     // to prevent scheduler from moving them apart.
     573             :     // In DAG pre-process SUnits are in the original order of
     574             :     // the instructions before scheduling.
     575      228408 :     for (SUnit &SU : DAG->SUnits) {
     576      196944 :       MachineInstr &MI2 = *SU.getInstr();
     577      196944 :       if (!MI2.mayLoad() && !MI2.mayStore()) {
     578             :         SUa = nullptr;
     579      132380 :         continue;
     580             :       }
     581       64564 :       if (!SUa) {
     582             :         SUa = &SU;
     583       35212 :         continue;
     584             :       }
     585             : 
     586       29352 :       MachineInstr &MI1 = *SUa->getInstr();
     587       19646 :       if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
     588       20078 :           (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
     589       17504 :           (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
     590        2823 :           (TII->isDS(MI1)   && TII->isDS(MI2))) {
     591       27341 :         SU.addPredBarrier(SUa);
     592             : 
     593      490311 :         for (const SDep &SI : SU.Preds) {
     594      231485 :           if (SI.getSUnit() != SUa)
     595      187769 :             SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
     596             :         }
     597             : 
     598       27341 :         if (&SU != &DAG->ExitSU) {
     599      585885 :           for (const SDep &SI : SUa->Succs) {
     600      279272 :             if (SI.getSUnit() != &SU)
     601      235556 :               SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
     602             :           }
     603             :         }
     604             :       }
     605             : 
     606             :       SUa = &SU;
     607             :     }
     608       31464 :   }
     609             : };
     610             : } // namespace
     611             : 
     612       13826 : void SISubtarget::getPostRAMutations(
     613             :     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
     614       27652 :   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
     615       13826 : }

Generated by: LCOV version 1.13