LLVM  9.0.0svn
AMDGPUPerfHintAnalysis.cpp
Go to the documentation of this file.
1 //===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Analyzes if a function potentially memory bound and if a kernel
11 /// kernel may benefit from limiting number of waves to reduce cache thrashing.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUPerfHintAnalysis.h"
17 #include "Utils/AMDGPUBaseInfo.h"
18 #include "llvm/ADT/SmallSet.h"
19 #include "llvm/ADT/Statistic.h"
24 #include "llvm/IR/Constants.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/IntrinsicInst.h"
27 #include "llvm/IR/Module.h"
28 #include "llvm/IR/ValueMap.h"
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "amdgpu-perf-hint"
34 
35 static cl::opt<unsigned>
36  MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
37  cl::desc("Function mem bound threshold in %"));
38 
39 static cl::opt<unsigned>
40  LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
41  cl::desc("Kernel limit wave threshold in %"));
42 
43 static cl::opt<unsigned>
44  IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
45  cl::desc("Indirect access memory instruction weight"));
46 
47 static cl::opt<unsigned>
48  LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
49  cl::desc("Large stride memory access weight"));
50 
51 static cl::opt<unsigned>
52  LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
53  cl::desc("Large stride memory access threshold"));
54 
55 STATISTIC(NumMemBound, "Number of functions marked as memory bound");
56 STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
57 
60 
62  "Analysis if a function is memory bound", true, true)
63 
64 namespace {
65 
66 struct AMDGPUPerfHint {
68 
69 public:
70  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
71  const TargetLowering *TLI_)
72  : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
73 
74  void runOnFunction(Function &F);
75 
76 private:
77  struct MemAccessInfo {
78  const Value *V;
79  const Value *Base;
80  int64_t Offset;
81  MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
82  bool isLargeStride(MemAccessInfo &Reference) const;
83 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
84  Printable print() const {
85  return Printable([this](raw_ostream &OS) {
86  OS << "Value: " << *V << '\n'
87  << "Base: " << *Base << " Offset: " << Offset << '\n';
88  });
89  }
90 #endif
91  };
92 
93  MemAccessInfo makeMemAccessInfo(Instruction *) const;
94 
95  MemAccessInfo LastAccess; // Last memory access info
96 
98 
99  const DataLayout *DL;
100 
101  const TargetLowering *TLI;
102 
103  void visit(const Function &F);
104  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
105  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
106 
107  bool isIndirectAccess(const Instruction *Inst) const;
108 
109  /// Check if the instruction is large stride.
110  /// The purpose is to identify memory access pattern like:
111  /// x = a[i];
112  /// y = a[i+1000];
113  /// z = a[i+2000];
114  /// In the above example, the second and third memory access will be marked
115  /// large stride memory access.
116  bool isLargeStride(const Instruction *Inst);
117 
118  bool isGlobalAddr(const Value *V) const;
119  bool isLocalAddr(const Value *V) const;
120  bool isConstantAddr(const Value *V) const;
121 };
122 
123 static const Value *getMemoryInstrPtr(const Instruction *Inst) {
124  if (auto LI = dyn_cast<LoadInst>(Inst)) {
125  return LI->getPointerOperand();
126  }
127  if (auto SI = dyn_cast<StoreInst>(Inst)) {
128  return SI->getPointerOperand();
129  }
130  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
131  return AI->getPointerOperand();
132  }
133  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
134  return AI->getPointerOperand();
135  }
136  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
137  return MI->getRawDest();
138  }
139 
140  return nullptr;
141 }
142 
143 bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
144  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
147  if (const Value *MO = getMemoryInstrPtr(Inst)) {
148  if (isGlobalAddr(MO))
149  WorkSet.insert(MO);
150  }
151 
152  while (!WorkSet.empty()) {
153  const Value *V = *WorkSet.begin();
154  WorkSet.erase(*WorkSet.begin());
155  if (!Visited.insert(V).second)
156  continue;
157  LLVM_DEBUG(dbgs() << " check: " << *V << '\n');
158 
159  if (auto LD = dyn_cast<LoadInst>(V)) {
160  auto M = LD->getPointerOperand();
161  if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
162  LLVM_DEBUG(dbgs() << " is IA\n");
163  return true;
164  }
165  continue;
166  }
167 
168  if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
169  auto P = GEP->getPointerOperand();
170  WorkSet.insert(P);
171  for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
172  WorkSet.insert(GEP->getOperand(I));
173  continue;
174  }
175 
176  if (auto U = dyn_cast<UnaryInstruction>(V)) {
177  WorkSet.insert(U->getOperand(0));
178  continue;
179  }
180 
181  if (auto BO = dyn_cast<BinaryOperator>(V)) {
182  WorkSet.insert(BO->getOperand(0));
183  WorkSet.insert(BO->getOperand(1));
184  continue;
185  }
186 
187  if (auto S = dyn_cast<SelectInst>(V)) {
188  WorkSet.insert(S->getFalseValue());
189  WorkSet.insert(S->getTrueValue());
190  continue;
191  }
192 
193  if (auto E = dyn_cast<ExtractElementInst>(V)) {
194  WorkSet.insert(E->getVectorOperand());
195  continue;
196  }
197 
198  LLVM_DEBUG(dbgs() << " dropped\n");
199  }
200 
201  LLVM_DEBUG(dbgs() << " is not IA\n");
202  return false;
203 }
204 
205 void AMDGPUPerfHint::visit(const Function &F) {
206  auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
207  if (!FIP.second)
208  return;
209 
210  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
211 
212  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
213 
214  for (auto &B : F) {
215  LastAccess = MemAccessInfo();
216  for (auto &I : B) {
217  if (getMemoryInstrPtr(&I)) {
218  if (isIndirectAccess(&I))
219  ++FI.IAMInstCount;
220  if (isLargeStride(&I))
221  ++FI.LSMInstCount;
222  ++FI.MemInstCount;
223  ++FI.InstCount;
224  continue;
225  }
226  CallSite CS(const_cast<Instruction *>(&I));
227  if (CS) {
229  if (!Callee || Callee->isDeclaration()) {
230  ++FI.InstCount;
231  continue;
232  }
233  if (&F == Callee) // Handle immediate recursion
234  continue;
235 
236  visit(*Callee);
237  auto Loc = FIM.find(Callee);
238 
239  assert(Loc != FIM.end() && "No func info");
240  FI.MemInstCount += Loc->second.MemInstCount;
241  FI.InstCount += Loc->second.InstCount;
242  FI.IAMInstCount += Loc->second.IAMInstCount;
243  FI.LSMInstCount += Loc->second.LSMInstCount;
244  } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
246  auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
247  AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
248  AM.HasBaseReg = !AM.BaseGV;
249  if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
250  GEP->getPointerAddressSpace()))
251  // Offset will likely be folded into load or store
252  continue;
253  ++FI.InstCount;
254  } else {
255  ++FI.InstCount;
256  }
257  }
258  }
259 }
260 
262  if (FIM.find(&F) != FIM.end())
263  return;
264 
265  const Module &M = *F.getParent();
266  DL = &M.getDataLayout();
267 
268  visit(F);
269  auto Loc = FIM.find(&F);
270 
271  assert(Loc != FIM.end() && "No func info");
272  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
273  << '\n'
274  << " IAMInst: " << Loc->second.IAMInstCount << '\n'
275  << " LSMInst: " << Loc->second.LSMInstCount << '\n'
276  << " TotalInst: " << Loc->second.InstCount << '\n');
277 
278  auto &FI = Loc->second;
279 
280  if (isMemBound(FI)) {
281  LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
282  NumMemBound++;
283  }
284 
285  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
286  LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
287  NumLimitWave++;
288  }
289 }
290 
291 bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
292  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
293 }
294 
295 bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
296  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
297  FI.LSMInstCount * LSWeight) *
298  100 / FI.InstCount) > LimitWaveThresh;
299 }
300 
301 bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
302  if (auto PT = dyn_cast<PointerType>(V->getType())) {
303  unsigned As = PT->getAddressSpace();
304  // Flat likely points to global too.
305  return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
306  }
307  return false;
308 }
309 
310 bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
311  if (auto PT = dyn_cast<PointerType>(V->getType()))
312  return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
313  return false;
314 }
315 
316 bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
317  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
318 
319  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
320  bool IsLargeStride = MAI.isLargeStride(LastAccess);
321  if (MAI.Base)
322  LastAccess = std::move(MAI);
323 
324  return IsLargeStride;
325 }
326 
327 AMDGPUPerfHint::MemAccessInfo
328 AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
329  MemAccessInfo MAI;
330  const Value *MO = getMemoryInstrPtr(Inst);
331 
332  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
333  // Do not treat local-addr memory access as large stride.
334  if (isLocalAddr(MO))
335  return MAI;
336 
337  MAI.V = MO;
338  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
339  return MAI;
340 }
341 
342 bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
343  if (auto PT = dyn_cast<PointerType>(V->getType())) {
344  unsigned As = PT->getAddressSpace();
345  return As == AMDGPUAS::CONSTANT_ADDRESS ||
347  }
348  return false;
349 }
350 
351 bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
352  MemAccessInfo &Reference) const {
353 
354  if (!Base || !Reference.Base || Base != Reference.Base)
355  return false;
356 
357  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
358  : Reference.Offset - Offset;
359  bool Result = Diff > LargeStrideThresh;
360  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
361  << print() << "<=>\n"
362  << Reference.print() << "Result:" << Result << '\n');
363  return Result;
364 }
365 } // namespace
366 
368  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
369  if (!TPC)
370  return false;
371 
372  const TargetMachine &TM = TPC->getTM<TargetMachine>();
373  const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);
374 
375  AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
376  Analyzer.runOnFunction(F);
377  return false;
378 }
379 
381  auto FI = FIM.find(F);
382  if (FI == FIM.end())
383  return false;
384 
385  return AMDGPUPerfHint::isMemBound(FI->second);
386 }
387 
389  auto FI = FIM.find(F);
390  if (FI == FIM.end())
391  return false;
392 
393  return AMDGPUPerfHint::needLimitWave(FI->second);
394 }
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:110
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This class represents lattice values for constants.
Definition: AllocatorList.h:23
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
bool needsWaveLimiter(const Function *F) const
static cl::opt< unsigned > LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, cl::desc("Kernel limit wave threshold in %"))
bool runOnFunction(Function &F) override
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass...
Address space for 32-bit constant memory.
Definition: AMDGPU.h:261
STATISTIC(NumFunctions, "Total number of functions")
F(f)
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it&#39;s an indirect...
Definition: CallSite.h:111
Hexagon Common GEP
bool erase(const T &V)
Definition: SmallSet.h:207
LLVM_NODISCARD bool empty() const
Definition: SmallSet.h:155
const_iterator begin() const
Definition: SmallSet.h:223
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:369
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:244
Address space for local memory.
Definition: AMDGPU.h:258
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset...
static cl::opt< unsigned > LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, cl::desc("Large stride memory access weight"))
static cl::opt< unsigned > LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, cl::desc("Large stride memory access threshold"))
static bool runOnFunction(Function &F, bool PostInlining)
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
This file contains the declarations for the subclasses of Constant, which represent the different fla...
bool isEntryFunctionCC(CallingConv::ID CC)
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:254
char & AMDGPUPerfHintAnalysisID
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static cl::opt< unsigned > IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, cl::desc("Indirect access memory instruction weight"))
INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, "Analysis if a function is memory bound", true, true) namespace
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn&#39;t already there.
Definition: SmallSet.h:180
Address space for constant memory (VTX2).
Definition: AMDGPU.h:257
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
static cl::opt< unsigned > MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, cl::desc("Function mem bound threshold in %"))
Address space for flat memory.
Definition: AMDGPU.h:253
Module.h This file contains the declarations for the Module class.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target&#39;s TargetSubtargetInf...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
#define DEBUG_TYPE
amdgpu Simplify well known AMD library false FunctionCallee Callee
TargetSubtargetInfo - Generic base class for all target subtargets.
bool isMemoryBound(const Function *F) const
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
#define I(x, y, z)
Definition: MD5.cpp:58
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:205
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:565
LLVM Value Representation.
Definition: Value.h:72
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:65
IRTranslator LLVM IR MI
Simple wrapper around std::function<void(raw_ostream&)>.
Definition: Printable.h:37
#define LLVM_DEBUG(X)
Definition: Debug.h:122
Analyzes if a function potentially memory bound and if a kernel kernel may benefit from limiting numb...
This file describes how to lower LLVM code to machine code.