//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

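// Note: these options feed the two heuristics defined below. isMemBound()
// compares the plain memory-instruction percentage against MemBoundThresh,
// while needLimitWave() additionally weights indirect-access and large-stride
// memory instructions by IAWeight and LSWeight before comparing against
// LimitWaveThresh.
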
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride access.
  /// The purpose is to identify memory access patterns like:
  ///   x = a[i];
  ///   y = a[i+1000];
  ///   z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}
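
// Illustrative only (not part of the original source): a global or flat load
// whose address is itself produced by a load is what the walk above reports
// as indirect, e.g.
//   %p = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %pp
//   %v = load float, float addrspace(1)* %p
// Here the pointer operand of %v traces back to the load %p from global
// memory, so %v is counted as an indirect-access memory instruction.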

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}
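
// Note on the counting above: calls to callees that already have an entry in
// FIM fold the callee's counts into the caller's totals, while GEPs whose
// constant offset forms a legal addressing mode for the target are not
// counted at all, on the assumption that they fold into the following load or
// store.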

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}
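
// The attributes added above also act as a cache: the early return at the top
// of runOnFunction() skips re-analysis whenever both "amdgpu-memory-bound"
// and "amdgpu-wave-limiter" are already present on the function.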

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}
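
// Worked example (illustrative, not from the source): with the default
// weights, a 1000-instruction function containing exactly one indirect-access
// memory instruction scores (1 + 1 * 1000) * 100 / 1000 = 100 in integer
// arithmetic, which exceeds the default LimitWaveThresh of 50, so a single
// indirect access is enough for needLimitWave() to fire on a kernel of that
// size.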

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
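
// Worked example (illustrative, not from the source): for x = a[i] and
// y = a[i + 1000] on a float array, where both addresses reduce to the same
// base pointer plus constant byte offsets of 0 and 4000, the bases match and
// the 4000-byte difference exceeds the default LargeStrideThresh of 64, so
// the second access is counted as a large-stride memory access.
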
} // namespace

bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}
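
// Note: as a CallGraphSCCPass this runs bottom-up over the call graph, so by
// the time a caller is processed its already-analyzed callees have entries in
// FIM and visit() can reuse their instruction counts.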

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}
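
// Illustrative usage (assumption, not from this file): downstream code can
// either call these helpers on the analysis result or simply test the
// attributes set by runOnFunction(), e.g.
//   if (F.hasFnAttribute("amdgpu-memory-bound") ||
//       F.hasFnAttribute("amdgpu-wave-limiter"))
//     /* treat the function as memory bound / wave limited */;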