#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const SITargetLowering *TLI_)
      : FIM(FIM_), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info.

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
  const DataLayout *DL = nullptr;
  const SITargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI);

  bool isIndirectAccess(const Instruction *Inst) const;
  bool isLargeStride(const Instruction *Inst);
  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};
static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (const auto *LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (const auto *SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (const auto *AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (const auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}
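// Note that AnyMemIntrinsic (memcpy/memmove/memset) is modeled as a plain i8
// access to its destination, so the cost accounting in visit() below charges
// it divideCeil(8, 32) == 1 unit, the same as a scalar access.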
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  // Walk the address computation backwards; if it depends on a load from
  // global memory, the access is indirect.
  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(V);
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (const auto *LD = dyn_cast<LoadInst>(V)) {
      const auto *M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (const auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
      const auto *P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (const auto *U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (const auto *BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (const auto *S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (const auto *E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }
  }

  return false;
}
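// For illustration (hand-written IR sketch, not from this file), the walk
// above flags patterns where the address itself comes from global memory:
//   %p = load ptr addrspace(1), ptr addrspace(1) %table ; pointer from global
//   %v = load i32, ptr addrspace(1) %p                  ; indirect access
// The second load's address depends on the first global load, so the second
// load is counted as an indirect access.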
// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }

        // Fold the callee's already-computed costs into the caller.
        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into the load or store.
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // A function with a basic block of dense global memory access is treated as
  // memory bound regardless of the overall cost ratio.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}
bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto *PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}
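// The intent is to catch strided access sequences within a block, e.g.:
//   x = a[i];
//   y = a[i + 1000];
//   z = a[i + 2000];
// The second and third accesses share a base with the previous access but sit
// more than LargeStrideThresh (default 64) bytes away, so both are counted as
// large-stride.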
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;

  // Do not treat local memory access as large stride.
  if (!MO || isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {
  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}

} // namespace
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}
bool AMDGPUPerfHintAnalysis::runOnSCC(const GCNTargetMachine &TM,
                                      CallGraphSCC &SCC) {
  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }
  return Changed;
}
bool AMDGPUPerfHintAnalysis::run(const GCNTargetMachine &TM,
                                 LazyCallGraph &CG) {
  bool Changed = false;
  CG.buildRefSCCs();
  // Process bottom-up so callee costs are in FIM before callers are visited.
  for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
    for (LazyCallGraph::SCC &SCC : RC) {
      if (SCC.size() != 1)
        continue;
      Function &F = SCC.begin()->getFunction();
      if (F.isDeclaration())
        continue;
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
      if (Analyzer.runOnFunction(F))
        Changed = true;
    }
  }
  return Changed;
}
char AMDGPUPerfHintAnalysisLegacy::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisLegacyID = AMDGPUPerfHintAnalysisLegacy::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysisLegacy, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;
  const GCNTargetMachine &TM = TPC->getTM<GCNTargetMachine>();
  return Impl.runOnSCC(TM, SCC);
}
PreservedAnalyses AMDGPUPerfHintAnalysisPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
  auto &CG = AM.getResult<LazyCallGraphAnalysis>(M);
  bool Changed = Impl->run(TM, CG);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserve<LazyCallGraphAnalysis>();
  return PA;
}
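// Assuming the pass keeps its usual registration in the AMDGPU pass registry,
// the new-PM entry point above can be exercised with something like
// (illustrative invocation; the pass name comes from the backend's registry,
// not from this file):
//   opt -mtriple=amdgcn -passes=amdgpu-perf-hint -S foo.ll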