LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUPerfHintAnalysis.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 71 121 58.7 %
Date: 2018-10-20 13:21:21 Functions: 9 16 56.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting number of waves to reduce cache thrashing.
      13             : ///
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUPerfHintAnalysis.h"
      18             : #include "Utils/AMDGPUBaseInfo.h"
      19             : #include "llvm/ADT/SmallSet.h"
      20             : #include "llvm/ADT/Statistic.h"
      21             : #include "llvm/Analysis/ValueTracking.h"
      22             : #include "llvm/CodeGen/TargetLowering.h"
      23             : #include "llvm/CodeGen/TargetPassConfig.h"
      24             : #include "llvm/CodeGen/TargetSubtargetInfo.h"
      25             : #include "llvm/IR/Constants.h"
      26             : #include "llvm/IR/Instructions.h"
      27             : #include "llvm/IR/IntrinsicInst.h"
      28             : #include "llvm/IR/Module.h"
      29             : #include "llvm/IR/ValueMap.h"
      30             : #include "llvm/Support/CommandLine.h"
      31             : 
      32             : using namespace llvm;
      33             : 
      34             : #define DEBUG_TYPE "amdgpu-perf-hint"
      35             : 
      36             : static cl::opt<unsigned>
      37             :     MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
      38             :                    cl::desc("Function mem bound threshold in %"));
      39             : 
      40             : static cl::opt<unsigned>
      41             :     LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
      42             :                     cl::desc("Kernel limit wave threshold in %"));
      43             : 
      44             : static cl::opt<unsigned>
      45             :     IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
      46             :              cl::desc("Indirect access memory instruction weight"));
      47             : 
      48             : static cl::opt<unsigned>
      49             :     LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
      50             :              cl::desc("Large stride memory access weight"));
      51             : 
      52             : static cl::opt<unsigned>
      53             :     LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
      54             :                       cl::desc("Large stride memory access threshold"));
      55             : 
      56             : STATISTIC(NumMemBound, "Number of functions marked as memory bound");
      57             : STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
      58             : 
      59             : char llvm::AMDGPUPerfHintAnalysis::ID = 0;
      60             : char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
      61             : 
      62      170210 : INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
      63             :                 "Analysis if a function is memory bound", true, true)
      64             : 
      65             : namespace {
      66             : 
/// Walks a function's IR and accumulates per-function statistics
/// (memory/indirect-access/large-stride instruction counts) that are used to
/// decide whether the function is memory bound or should limit waves.
struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  /// \param FIM_ shared map from Function to accumulated FuncInfo; filled by
  ///        runOnFunction/visit and read back by AMDGPUPerfHintAnalysis.
  /// \param TLI_ target lowering info, used to query legal addressing modes.
  /// DL is set lazily in runOnFunction once the module is known.
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  void runOnFunction(Function &F);

private:
  /// A memory access decomposed into base pointer plus constant offset,
  /// used to compare consecutive accesses for large-stride detection.
  struct MemAccessInfo {
    const Value *V;    // Pointer operand of the access.
    const Value *Base; // Underlying base pointer of V (nullptr if unknown).
    int64_t Offset;    // Constant byte offset of V from Base.
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  // Accumulate statistics for F (and, recursively, its callees) into FIM.
  void visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is large stride.
  /// The purpose is to identify memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory access will be marked
  /// large stride memory access.
  bool isLargeStride(const Instruction *Inst);

  // Address-space classification helpers for pointer values.
  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};
     123             : 
     124      410194 : static const Value *getMemoryInstrPtr(const Instruction *Inst) {
     125             :   if (auto LI = dyn_cast<LoadInst>(Inst)) {
     126             :     return LI->getPointerOperand();
     127             :   }
     128             :   if (auto SI = dyn_cast<StoreInst>(Inst)) {
     129             :     return SI->getPointerOperand();
     130             :   }
     131             :   if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
     132             :     return AI->getPointerOperand();
     133             :   }
     134             :   if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
     135             :     return AI->getPointerOperand();
     136             :   }
     137             :   if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
     138             :     return MI->getRawDest();
     139             :   }
     140             : 
     141             :   return nullptr;
     142             : }
     143             : 
// Returns true if Inst is a memory access whose global/flat address is itself
// derived (via loads, GEPs, casts, arithmetic, selects, or vector extracts)
// from a value loaded from global, local, or constant memory — i.e. an
// indirect access. Implemented as a backward worklist walk over the address
// computation.
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  // Seed the walk with the pointer operand, but only for global/flat
  // accesses; other address spaces are not considered indirect here.
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    // Skip values already examined; the address DAG may share subtrees.
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    // A load feeding the address: indirect if it reads from global, local,
    // or constant memory.
    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    // GEP: trace both the base pointer and every index operand.
    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    // Casts and other unary instructions: look through to the operand.
    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    // Address arithmetic: trace both operands.
    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    // Select between two addresses: either arm may be indirect.
    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    // Address extracted from a vector: trace the vector operand.
    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    // Anything else (arguments, constants, unhandled instructions) ends the
    // walk along this path without proving indirection.
    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}
     205             : 
// Accumulate instruction statistics for F into FIM, recursing into direct
// callees defined in this module and folding their counts into F's entry.
// A function already present in FIM is not re-processed.
void AMDGPUPerfHint::visit(const Function &F) {
  auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
  if (!FIP.second)
    return;

  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    // Large-stride detection compares consecutive accesses; reset the
    // reference access at each basic-block boundary.
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      // Memory instruction: classify it and count it once.
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      CallSite CS(const_cast<Instruction *>(&I));
      if (CS) {
        Function *Callee = CS.getCalledFunction();
        // Indirect calls and external declarations: nothing to recurse
        // into, count the call itself as one instruction.
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        // Process the callee first, then fold its totals into the caller.
        visit(*Callee);
        auto Loc = FIM.find(Callee);

        assert(Loc != FIM.end() && "No func info");
        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        // A GEP whose base+offset form a legal addressing mode will fold
        // into the load/store and costs no extra instruction.
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }
}
     261             : 
     262       22010 : void AMDGPUPerfHint::runOnFunction(Function &F) {
     263       44020 :   if (FIM.find(&F) != FIM.end())
     264             :     return;
     265             : 
     266       22010 :   const Module &M = *F.getParent();
     267       22010 :   DL = &M.getDataLayout();
     268             : 
     269       22010 :   visit(F);
     270       22010 :   auto Loc = FIM.find(&F);
     271             : 
     272             :   assert(Loc != FIM.end() && "No func info");
     273             :   LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
     274             :                     << '\n'
     275             :                     << " IAMInst: " << Loc->second.IAMInstCount << '\n'
     276             :                     << " LSMInst: " << Loc->second.LSMInstCount << '\n'
     277             :                     << " TotalInst: " << Loc->second.InstCount << '\n');
     278             : 
     279             :   auto &FI = Loc->second;
     280             : 
     281             :   if (isMemBound(FI)) {
     282             :     LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
     283             :     NumMemBound++;
     284             :   }
     285             : 
     286             :   if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
     287             :     LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
     288             :     NumLimitWave++;
     289             :   }
     290             : }
     291             : 
     292           0 : bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
     293       21813 :   return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
     294             : }
     295             : 
     296             : bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
     297       65439 :   return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
     298       43626 :            FI.LSMInstCount * LSWeight) *
     299       43626 :           100 / FI.InstCount) > LimitWaveThresh;
     300             : }
     301             : 
     302           0 : bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
     303           0 :   if (auto PT = dyn_cast<PointerType>(V->getType())) {
     304             :     unsigned As = PT->getAddressSpace();
     305             :     // Flat likely points to global too.
     306           0 :     return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
     307             :   }
     308             :   return false;
     309             : }
     310             : 
     311           0 : bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
     312           0 :   if (auto PT = dyn_cast<PointerType>(V->getType()))
     313           0 :     return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
     314             :   return false;
     315             : }
     316             : 
     317       58597 : bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
     318             :   LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
     319             : 
     320       58597 :   MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
     321             :   bool IsLargeStride = MAI.isLargeStride(LastAccess);
     322       58597 :   if (MAI.Base)
     323       53514 :     LastAccess = std::move(MAI);
     324             : 
     325       58597 :   return IsLargeStride;
     326             : }
     327             : 
// Decompose Inst's pointer operand into base + constant offset for stride
// comparison. Local-address accesses return an empty (Base == nullptr) info
// so they are never treated as large stride.
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  // NOTE(review): *MO is dereferenced here with no null check; the only
  // caller (isLargeStride via visit) reaches this after getMemoryInstrPtr
  // returned non-null, so MO is presumably never null — confirm if new
  // callers are added.
  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}
     342             : 
     343           0 : bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
     344           0 :   if (auto PT = dyn_cast<PointerType>(V->getType())) {
     345             :     unsigned As = PT->getAddressSpace();
     346           0 :     return As == AMDGPUAS::CONSTANT_ADDRESS ||
     347           0 :            As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
     348             :   }
     349             :   return false;
     350             : }
     351             : 
     352           0 : bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
     353             :     MemAccessInfo &Reference) const {
     354             : 
     355       58597 :   if (!Base || !Reference.Base || Base != Reference.Base)
     356           0 :     return false;
     357             : 
     358       11003 :   uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
     359        3209 :                                             : Reference.Offset - Offset;
     360       11003 :   bool Result = Diff > LargeStrideThresh;
     361             :   LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
     362             :                << print() << "<=>\n"
     363             :                << Reference.print() << "Result:" << Result << '\n');
     364           0 :   return Result;
     365             : }
     366             : } // namespace
     367             : 
     368       22010 : bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
     369       22010 :   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
     370       22010 :   if (!TPC)
     371             :     return false;
     372             : 
     373       22010 :   const TargetMachine &TM = TPC->getTM<TargetMachine>();
     374       22010 :   const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
     375             : 
     376       22010 :   AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
     377       22010 :   Analyzer.runOnFunction(F);
     378       22010 :   return false;
     379             : }
     380             : 
     381       21813 : bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
     382             :   auto FI = FIM.find(F);
     383       21813 :   if (FI == FIM.end())
     384             :     return false;
     385             : 
     386       21813 :   return AMDGPUPerfHint::isMemBound(FI->second);
     387             : }
     388             : 
     389       21813 : bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
     390             :   auto FI = FIM.find(F);
     391       21813 :   if (FI == FIM.end())
     392             :     return false;
     393             : 
     394             :   return AMDGPUPerfHint::needLimitWave(FI->second);
     395             : }

Generated by: LCOV version 1.13