#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
61 "Analysis if a function is memory bound",
true,
true)

struct AMDGPUPerfHint {
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

  struct MemAccessInfo {
    const Value *V = nullptr;
    const Value *Base = nullptr;
    int64_t Offset = 0;
    MemAccessInfo() = default;
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info.

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
  const DataLayout *DL;
  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  // Check if an instruction is a large-stride memory access, e.g. the second
  // and third accesses in: x = a[i]; y = a[i + 1000]; z = a[i + 2000];
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

static std::pair<const Value *, const Type *>
getMemoryInstrPtrAndType(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst))
    return {LI->getPointerOperand(), LI->getType()};
  if (auto SI = dyn_cast<StoreInst>(Inst))
    return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
    return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
    return {AI->getPointerOperand(), AI->getValOperand()->getType()};
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
    return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};

  return {nullptr, nullptr};
}

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M)) {
        // The address is itself loaded from global memory: indirect access.
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }
  }

  return false;
}
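
// Illustrative example (not from this file): the worklist walk above flags
// accesses whose address is itself loaded from global memory, e.g. in IR:
//   %p = load ptr addrspace(1), ptr addrspace(1) %pp   ; address fetched from memory
//   %v = load i32, ptr addrspace(1) %p                 ; counted as indirect access
// Tracing the pointer operand of the second load reaches another global load,
// so isIndirectAccess() returns true for it.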

// Returns true if the global load I is used within its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        // Count memory instructions in units of 32-bit dwords.
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        // Accumulate the costs already computed for the callee.
        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;
        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // The offset will likely be folded into the load or store.
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    // Detect basic blocks with dense global memory access.
    unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
    if (GlobalMemAccPercentage > 50) {
      LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Basic Block "
                        << B.getName() << " has " << GlobalMemAccPercentage
                        << "% global memory access\n");
      FI.HasDenseGlobalMemAcc = true;
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                    << '\n'
                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
                    << " TotalInst cost: " << Info->InstCost << '\n');

  bool Changed = false;

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
    Changed = true;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat pointers are likely to access global memory as well.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);
  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtrAndType(Inst).first;
  if (isLocalAddr(MO)) // Local accesses are not large-stride candidates.
    return MAI;
  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {
  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
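
// Illustrative example (not from this file): with the default
// -amdgpu-large-stride-threshold=64, a sequence like
//   x = a[i];
//   y = a[i + 1000];
// typically decomposes to the same pointer base with constant offsets that
// differ by 4000 bytes for 32-bit elements, which exceeds 64, so the second
// access is counted as a large-stride access.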

} // namespace

bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;
    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }
  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;
  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;
  return AMDGPUPerfHint::needLimitWave(FI->second);
}
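
// Minimal usage sketch (assumption: a hypothetical consumer, not code from
// this file). Downstream code can either query this analysis object or simply
// check the function attributes set by runOnFunction() above, e.g.:
//
//   static bool isMarkedMemoryBound(const Function &F) {
//     return F.getFnAttribute("amdgpu-memory-bound").getValueAsString() == "true";
//   }
//   static bool isMarkedWaveLimited(const Function &F) {
//     return F.getFnAttribute("amdgpu-wave-limiter").getValueAsString() == "true";
//   }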