#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
61 "Analysis if a function is memory bound",
true,
true)
struct AMDGPUPerfHint {
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}
  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };
  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess;
  bool runOnFunction(Function &F);

  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI);

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);

  bool isIndirectAccess(const Instruction *Inst) const;
  bool isLargeStride(const Instruction *Inst);
  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
  const DataLayout *DL;
  const TargetLowering *TLI;
};
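// Returns the pointer operand and the accessed type for memory instructions
// (loads, stores, cmpxchg, atomicrmw and memory intrinsics), or
// {nullptr, nullptr} for non-memory instructions.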
static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}
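// Illustrative example of the pattern isIndirectAccess() looks for: the
// address of a global access is itself produced by a load from global
// memory (names below are made up).
//   %ptr = load ptr addrspace(1), ptr addrspace(1) %table
//   %val = load i32, ptr addrspace(1) %ptr   ; counted as an indirect access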
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(V);
    if (!Visited.insert(V).second)
      continue;

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M))
        return true;
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }
  }

  return false;
}
// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}
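// Per-function cost accounting: visit() walks every basic block, classifies
// each instruction, and accumulates the counters in the function's FuncInfo
// entry (InstCost overall, IAMInstCost for indirect accesses, LSMInstCost for
// large-stride accesses). isMemBound() and needLimitWave() then compare these
// counters against MemBoundThresh and LimitWaveThresh, with IAWeight and
// LSWeight weighting the indirect-access and large-stride costs.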
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion.
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into the load or store.
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "Set HasDenseGlobalMemAcc: basic block "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}
bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}
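// Stride detection is stateful: each access is compared against LastAccess,
// the previous access seen in the same basic block (visit() resets LastAccess
// at the start of every block), so only consecutive accesses off the same
// base pointer are considered.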
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;
  // Do not treat local-address memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;
  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(const_cast<Value *>(MO),
                                              MAI.Offset, *DL);
  return MAI;
}
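// Illustrative example (made-up values): two accesses that decompose to the
// same base with offsets 0 and 256 are 256 bytes apart, which exceeds the
// default amdgpu-large-stride-threshold of 64, so the second access is
// counted as a large-stride access.
//   %p = getelementptr i8, ptr addrspace(1) %base, i64 0
//   %q = getelementptr i8, ptr addrspace(1) %base, i64 256
//   %x = load i32, ptr addrspace(1) %p   ; Base = %base, Offset = 0
//   %y = load i32, ptr addrspace(1) %q   ; Base = %base, Offset = 256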
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {
  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;
  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }
  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;
  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;
  return AMDGPUPerfHint::needLimitWave(FI->second);
}
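// Illustrative consumer (not from this file): a legacy-PM pass that declared
// AMDGPUPerfHintAnalysis as a required dependency could query the cached
// results like
//   const auto &PHA = getAnalysis<AMDGPUPerfHintAnalysis>();
//   if (PHA.isMemoryBound(&F) || PHA.needsWaveLimiter(&F)) {
//     // e.g. adjust scheduling or occupancy decisions for F
//   }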