LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUSubtarget.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 218 226 96.5 %
Date: 2018-10-20 13:21:21 Functions: 24 25 96.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// Implements the AMDGPU specific subclass of TargetSubtarget.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "AMDGPUSubtarget.h"
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUTargetMachine.h"
      18             : #include "AMDGPUCallLowering.h"
      19             : #include "AMDGPUInstructionSelector.h"
      20             : #include "AMDGPULegalizerInfo.h"
      21             : #include "AMDGPURegisterBankInfo.h"
      22             : #include "SIMachineFunctionInfo.h"
      23             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      24             : #include "llvm/ADT/SmallString.h"
      25             : #include "llvm/CodeGen/MachineScheduler.h"
      26             : #include "llvm/MC/MCSubtargetInfo.h"
      27             : #include "llvm/IR/MDBuilder.h"
      28             : #include "llvm/CodeGen/TargetFrameLowering.h"
      29             : #include <algorithm>
      30             : 
      31             : using namespace llvm;
      32             : 
      33             : #define DEBUG_TYPE "amdgpu-subtarget"
      34             : 
      35             : #define GET_SUBTARGETINFO_TARGET_DESC
      36             : #define GET_SUBTARGETINFO_CTOR
      37             : #define AMDGPUSubtarget GCNSubtarget
      38             : #include "AMDGPUGenSubtargetInfo.inc"
      39             : #define GET_SUBTARGETINFO_TARGET_DESC
      40             : #define GET_SUBTARGETINFO_CTOR
      41             : #undef AMDGPUSubtarget
      42             : #include "R600GenSubtargetInfo.inc"
      43             : 
// Out-of-line defaulted destructor: keeps the vtable/key function for
// GCNSubtarget in this translation unit.
GCNSubtarget::~GCNSubtarget() = default;
      45             : 
      46             : R600Subtarget &
      47         291 : R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
      48             :                                                StringRef GPU, StringRef FS) {
      49             :   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
      50             :   FullFS += FS;
      51         291 :   ParseSubtargetFeatures(GPU, FullFS);
      52             : 
      53             :   // FIXME: I don't think think Evergreen has any useful support for
      54             :   // denormals, but should be checked. Should we issue a warning somewhere
      55             :   // if someone tries to enable these?
      56         291 :   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
      57         291 :     FP32Denormals = false;
      58             :   }
      59             : 
      60         291 :   HasMulU24 = getGeneration() >= EVERGREEN;
      61         291 :   HasMulI24 = hasCaymanISA();
      62             : 
      63         291 :   return *this;
      64             : }
      65             : 
      66             : GCNSubtarget &
      67        2492 : GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
      68             :                                                  StringRef GPU, StringRef FS) {
      69             :   // Determine default and user-specified characteristics
      70             :   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
      71             :   // enabled, but some instructions do not respect them and they run at the
      72             :   // double precision rate, so don't enable by default.
      73             :   //
      74             :   // We want to be able to turn these off, but making this a subtarget feature
      75             :   // for SI has the unhelpful behavior that it unsets everything else if you
      76             :   // disable it.
      77             : 
      78             :   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
      79             : 
      80        2492 :   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
      81             :     FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
      82             : 
      83             :   // FIXME: I don't think think Evergreen has any useful support for
      84             :   // denormals, but should be checked. Should we issue a warning somewhere
      85             :   // if someone tries to enable these?
      86        2492 :   if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      87             :     FullFS += "+fp64-fp16-denormals,";
      88             :   } else {
      89             :     FullFS += "-fp32-denormals,";
      90             :   }
      91             : 
      92             :   FullFS += FS;
      93             : 
      94        2492 :   ParseSubtargetFeatures(GPU, FullFS);
      95             : 
      96             :   // We don't support FP64 for EG/NI atm.
      97             :   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
      98             : 
      99             :   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
     100             :   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
     101             :   // variants of MUBUF instructions.
     102        3731 :   if (!hasAddr64() && !FS.contains("flat-for-global")) {
     103         886 :     FlatForGlobal = true;
     104             :   }
     105             : 
     106             :   // Set defaults if needed.
     107        2492 :   if (MaxPrivateElementSize == 0)
     108        2471 :     MaxPrivateElementSize = 4;
     109             : 
     110        2492 :   if (LDSBankCount == 0)
     111         720 :     LDSBankCount = 32;
     112             : 
     113        2492 :   if (TT.getArch() == Triple::amdgcn) {
     114        2473 :     if (LocalMemorySize == 0)
     115         701 :       LocalMemorySize = 32768;
     116             : 
     117             :     // Do something sensible for unspecified target.
     118        2473 :     if (!HasMovrel && !HasVGPRIndexMode)
     119         701 :       HasMovrel = true;
     120             :   }
     121             : 
     122        2492 :   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
     123             : 
     124        2492 :   return *this;
     125             : }
     126             : 
// Base-class constructor: establishes conservative feature defaults that are
// later refined by the derived R600/GCN subtargets via
// initializeSubtargetDependencies() / ParseSubtargetFeatures().
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
     144             : 
// GCN subtarget constructor. Every feature flag starts false/zero here;
// initializeSubtargetDependencies() (invoked from the InstrInfo initializer
// below, before TLInfo is built) parses the CPU/feature strings and fills
// them in. NOTE(review): the init order therefore matters — InstrInfo must
// be declared before TLInfo in the class for this to be valid.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Speed-related properties.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // Floating-point / addressing-mode defaults.
    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    // Optional optimizations / scheduling knobs.
    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    // ISA capability flags, refined per generation by feature parsing.
    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    // Side effect: this call parses GPU/FS and sets the flags above.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // GlobalISel support objects; InstSelector needs the concrete
  // AMDGPURegisterBankInfo, hence the static_cast on the stored base pointer.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
     222             : 
     223       18943 : unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
     224             :   const Function &F) const {
     225       18943 :   if (NWaves == 1)
     226          26 :     return getLocalMemorySize();
     227       18917 :   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
     228       18917 :   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
     229             :   unsigned MaxWaves = getMaxWavesPerEU();
     230       18917 :   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
     231             : }
     232             : 
     233      184036 : unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
     234             :   const Function &F) const {
     235      184036 :   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
     236      184036 :   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
     237      184036 :   unsigned MaxWaves = getMaxWavesPerEU();
     238      184036 :   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
     239      364852 :   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
     240      184036 :   NumWaves = std::min(NumWaves, MaxWaves);
     241      184036 :   NumWaves = std::max(NumWaves, 1u);
     242      184036 :   return NumWaves;
     243             : }
     244             : 
// Convenience overload: read the LDS usage recorded in the machine function
// info and forward to the byte-count variant above.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
     250             : 
     251             : std::pair<unsigned, unsigned>
     252      267721 : AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
     253             :   switch (CC) {
     254      219060 :   case CallingConv::AMDGPU_CS:
     255             :   case CallingConv::AMDGPU_KERNEL:
     256             :   case CallingConv::SPIR_KERNEL:
     257      219060 :     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
     258       23495 :   case CallingConv::AMDGPU_VS:
     259             :   case CallingConv::AMDGPU_LS:
     260             :   case CallingConv::AMDGPU_HS:
     261             :   case CallingConv::AMDGPU_ES:
     262             :   case CallingConv::AMDGPU_GS:
     263             :   case CallingConv::AMDGPU_PS:
     264       23495 :     return std::make_pair(1, getWavefrontSize());
     265       25166 :   default:
     266       25166 :     return std::make_pair(1, 16 * getWavefrontSize());
     267             :   }
     268             : }
     269             : 
     270      267721 : std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
     271             :   const Function &F) const {
     272             :   // FIXME: 1024 if function.
     273             :   // Default minimum/maximum flat work group sizes.
     274             :   std::pair<unsigned, unsigned> Default =
     275      267721 :     getDefaultFlatWorkGroupSize(F.getCallingConv());
     276             : 
     277             :   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
     278             :   // starts using "amdgpu-flat-work-group-size" attribute.
     279      267721 :   Default.second = AMDGPU::getIntegerAttribute(
     280      267721 :     F, "amdgpu-max-work-group-size", Default.second);
     281      267721 :   Default.first = std::min(Default.first, Default.second);
     282             : 
     283             :   // Requested minimum/maximum flat work group sizes.
     284      267721 :   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
     285      267721 :     F, "amdgpu-flat-work-group-size", Default);
     286             : 
     287             :   // Make sure requested minimum is less than requested maximum.
     288      267721 :   if (Requested.first > Requested.second)
     289           0 :     return Default;
     290             : 
     291             :   // Make sure requested values do not violate subtarget's specifications.
     292      267721 :   if (Requested.first < getMinFlatWorkGroupSize())
     293          73 :     return Default;
     294      267648 :   if (Requested.second > getMaxFlatWorkGroupSize())
     295           0 :     return Default;
     296             : 
     297      267648 :   return Requested;
     298             : }
     299             : 
/// Compute the {min, max} number of waves per execution unit for \p F,
/// validating the "amdgpu-waves-per-eu" attribute against the subtarget's
/// limits and against the flat workgroup size attributes. Any invalid
/// request falls back to the defaults.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // A zero Requested.second means "no explicit maximum", so skip that check.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
     347             : 
/// Attach !range metadata to \p I describing the possible values of a
/// workitem-id or local-size query, based on the kernel's flat workgroup
/// size (narrowed by reqd_work_group_size when present).
/// Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Map the intrinsic to the workgroup dimension it queries; the *_id_*
      // intrinsics are ID queries (exclusive upper bound), the local_size
      // ones are size queries (inclusive upper bound, adjusted below).
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // NOTE(review): Dim is only ever 0-2 or UINT_MAX here, so "<= 3"
      // behaves the same as "<= 2"; kept as-is.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable upper bound — don't emit metadata.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
     409             : 
     410       38433 : uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
     411             :                                                  unsigned &MaxAlign) const {
     412             :   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
     413             :          F.getCallingConv() == CallingConv::SPIR_KERNEL);
     414             : 
     415       38433 :   const DataLayout &DL = F.getParent()->getDataLayout();
     416             :   uint64_t ExplicitArgBytes = 0;
     417       38433 :   MaxAlign = 1;
     418             : 
     419      122838 :   for (const Argument &Arg : F.args()) {
     420       84405 :     Type *ArgTy = Arg.getType();
     421             : 
     422       84405 :     unsigned Align = DL.getABITypeAlignment(ArgTy);
     423       84405 :     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
     424      168810 :     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
     425       84405 :     MaxAlign = std::max(MaxAlign, Align);
     426             :   }
     427             : 
     428       38433 :   return ExplicitArgBytes;
     429             : }
     430             : 
     431       19817 : unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
     432             :                                                 unsigned &MaxAlign) const {
     433       19817 :   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
     434             : 
     435             :   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
     436             : 
     437       19817 :   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
     438       19817 :   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
     439       19817 :   if (ImplicitBytes != 0) {
     440             :     unsigned Alignment = getAlignmentForImplicitArgPtr();
     441        1756 :     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
     442             :   }
     443             : 
     444             :   // Being able to dereference past the end is useful for emitting scalar loads.
     445       19817 :   return alignTo(TotalSize, 4);
     446             : }
     447             : 
// R600 subtarget constructor. Feature flags default to off/zero and are
// filled in by initializeSubtargetDependencies(), which runs as part of the
// TLInfo initializer below (before InstrItins).
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  // Side effect: initializeSubtargetDependencies parses GPU/FS and sets the
  // flags above before TLInfo is constructed.
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
     465             : 
     466       47458 : void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
     467             :                                       unsigned NumRegionInstrs) const {
     468             :   // Track register pressure so the scheduler can try to decrease
     469             :   // pressure once register usage is above the threshold defined by
     470             :   // SIRegisterInfo::getRegPressureSetLimit()
     471       47458 :   Policy.ShouldTrackPressure = true;
     472             : 
     473             :   // Enabling both top down and bottom up scheduling seems to give us less
     474             :   // register spills than just using one of these approaches on its own.
     475       47458 :   Policy.OnlyTopDown = false;
     476       47458 :   Policy.OnlyBottomUp = false;
     477             : 
     478             :   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
     479       47458 :   if (!enableSIScheduler())
     480       47455 :     Policy.ShouldTrackLaneMasks = true;
     481       47458 : }
     482             : 
     483       24723 : bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
     484       24723 :   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
     485             : }
     486             : 
     487        1715 : unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
     488        1715 :   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     489        1145 :     if (SGPRs <= 80)
     490             :       return 10;
     491          63 :     if (SGPRs <= 88)
     492             :       return 9;
     493          63 :     if (SGPRs <= 100)
     494             :       return 8;
     495          56 :     return 7;
     496             :   }
     497         570 :   if (SGPRs <= 48)
     498             :     return 10;
     499         135 :   if (SGPRs <= 56)
     500             :     return 9;
     501         120 :   if (SGPRs <= 64)
     502             :     return 8;
     503         105 :   if (SGPRs <= 72)
     504             :     return 7;
     505          71 :   if (SGPRs <= 80)
     506           2 :     return 6;
     507             :   return 5;
     508             : }
     509             : 
     510        1715 : unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
     511        1715 :   if (VGPRs <= 24)
     512             :     return 10;
     513        1078 :   if (VGPRs <= 28)
     514             :     return 9;
     515        1059 :   if (VGPRs <= 32)
     516             :     return 8;
     517         826 :   if (VGPRs <= 36)
     518             :     return 7;
     519         392 :   if (VGPRs <= 40)
     520             :     return 6;
     521         364 :   if (VGPRs <= 48)
     522             :     return 5;
     523         352 :   if (VGPRs <= 64)
     524             :     return 4;
     525         323 :   if (VGPRs <= 84)
     526             :     return 3;
     527         201 :   if (VGPRs <= 128)
     528          27 :     return 2;
     529             :   return 1;
     530             : }
     531             : 
     532      141261 : unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
     533             :   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
     534      141261 :   if (MFI.hasFlatScratchInit()) {
     535        3292 :     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     536             :       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
     537        1296 :     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
     538             :       return 4; // FLAT_SCRATCH, VCC (in that order).
     539             :   }
     540             : 
     541      138275 :   if (isXNACKEnabled())
     542        2845 :     return 4; // XNACK, VCC (in that order).
     543             :   return 2; // VCC.
     544             : }
     545             : 
/// Maximum number of SGPRs the function may allocate, after honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, the SGPR-init
/// hardware bug, and the reserved special registers.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 throughout means "ignore the attribute".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-init bug must use a fixed SGPR allocation.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Leave room for the reserved special registers, and never exceed the
  // addressable limit.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
     595             : 
     596      103448 : unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
     597      103448 :   const Function &F = MF.getFunction();
     598             :   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
     599             : 
     600             :   // Compute maximum number of VGPRs function can use using default/requested
     601             :   // minimum number of waves per execution unit.
     602             :   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
     603             :   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
     604             : 
     605             :   // Check if maximum number of VGPRs was explicitly requested using
     606             :   // "amdgpu-num-vgpr" attribute.
     607      103448 :   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
     608          36 :     unsigned Requested = AMDGPU::getIntegerAttribute(
     609          18 :       F, "amdgpu-num-vgpr", MaxNumVGPRs);
     610             : 
     611             :     // Make sure requested value is compatible with values implied by
     612             :     // default/requested minimum/maximum number of waves per execution unit.
     613          36 :     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
     614             :       Requested = 0;
     615          18 :     if (WavesPerEU.second &&
     616          36 :         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
     617             :       Requested = 0;
     618             : 
     619          18 :     if (Requested)
     620             :       MaxNumVGPRs = Requested;
     621             :   }
     622             : 
     623      103448 :   return MaxNumVGPRs;
     624             : }
     625             : 
namespace {
/// Post-RA DAG mutation that keeps runs of same-kind memory operations
/// adjacent during scheduling by inserting artificial dependencies.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII; // Used to classify instructions (VMEM/FLAT/SMRD/DS).

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    // SUa tracks the previous memory-op SUnit in program order; it is reset
    // whenever a non-memory instruction breaks the run.
    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      // Only cluster the pair when both instructions belong to the same
      // memory class (VMEM, FLAT, SMRD, or DS).
      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        // Force SUa to be scheduled immediately before SU.
        SU.addPredBarrier(SUa);

        // Every other predecessor of SU must also precede SUa, so nothing
        // can be scheduled between the pair from below.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Symmetrically, every other successor of SUa must follow SU, so
        // nothing can slip between the pair from above. ExitSU is skipped
        // because nothing may depend on it.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
     676             : 
     677       15844 : void GCNSubtarget::getPostRAMutations(
     678             :     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
     679       15844 :   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
     680       15844 : }
     681             : 
     682       41429 : const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
     683       41429 :   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
     684       36865 :     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
     685             :   else
     686        4564 :     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
     687             : }
     688             : 
     689       46856 : const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
     690       46856 :   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
     691       41753 :     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
     692             :   else
     693        5103 :     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
     694             : }

Generated by: LCOV version 1.13