Line data Source code
1 : //===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// This is AMDGPU specific replacement of the standard inliner.
12 : /// The main purpose is to account for the fact that calls are not only expensive
13 : /// on the AMDGPU, but much more expensive if a private memory pointer is
14 : /// passed to a function as an argument. In this situation, we are unable to
15 : /// eliminate private memory in the caller unless inlined and end up with slow
16 : /// and expensive scratch access. Thus, we boost the inline threshold for such
17 : /// functions here.
18 : ///
19 : //===----------------------------------------------------------------------===//
20 :
21 :
22 : #include "AMDGPU.h"
23 : #include "llvm/Transforms/IPO.h"
24 : #include "llvm/Analysis/AssumptionCache.h"
25 : #include "llvm/Analysis/CallGraph.h"
26 : #include "llvm/Analysis/InlineCost.h"
27 : #include "llvm/Analysis/ValueTracking.h"
28 : #include "llvm/Analysis/TargetTransformInfo.h"
29 : #include "llvm/IR/CallSite.h"
30 : #include "llvm/IR/DataLayout.h"
31 : #include "llvm/IR/Instructions.h"
32 : #include "llvm/IR/Module.h"
33 : #include "llvm/IR/Type.h"
34 : #include "llvm/Support/CommandLine.h"
35 : #include "llvm/Support/Debug.h"
36 : #include "llvm/Transforms/IPO/Inliner.h"
37 :
38 : using namespace llvm;
39 :
40 : #define DEBUG_TYPE "inline"
41 :
// Amount added to the inline threshold by getInlineThreshold() when a call
// site passes at least one pointer whose underlying object is a static alloca
// in the private address space (and the total size stays within
// ArgAllocaCutoff). Hidden option; defaults to 2200.
static cl::opt<int>
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
              cl::desc("Cost of alloca argument"));
45 :
// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers, we gain nothing by aggressively inlining functions for
// that heuristic. getInlineThreshold() sums the alloc sizes (as reported by
// DataLayout::getTypeAllocSize) of the distinct static allocas passed to a
// call and skips the threshold boost once the sum exceeds this value.
// Hidden option; defaults to 256.
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
                cl::desc("Maximum alloca size to use for inline cost"));
52 :
53 : namespace {
54 :
55 : class AMDGPUInliner : public LegacyInlinerBase {
56 :
57 : public:
58 2 : AMDGPUInliner() : LegacyInlinerBase(ID) {
59 2 : initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
60 2 : Params = getInlineParams();
61 2 : }
62 :
63 : static char ID; // Pass identification, replacement for typeid
64 :
65 : unsigned getInlineThreshold(CallSite CS) const;
66 :
67 : InlineCost getInlineCost(CallSite CS) override;
68 :
69 : bool runOnSCC(CallGraphSCC &SCC) override;
70 :
71 : void getAnalysisUsage(AnalysisUsage &AU) const override;
72 :
73 : private:
74 : TargetTransformInfoWrapperPass *TTIWP;
75 :
76 : InlineParams Params;
77 : };
78 :
79 : } // end anonymous namespace
80 :
// Definition of the static pass identifier declared in AMDGPUInliner.
char AMDGPUInliner::ID = 0;

// Register the pass under the command-line name "amdgpu-inline" and list the
// passes it depends on. The two trailing `false` arguments are forwarded
// unchanged to the INITIALIZE_PASS_* macros.
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
                      "AMDGPU Function Integration/Inlining", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
                    "AMDGPU Function Integration/Inlining", false, false)
91 :
92 2 : Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
93 :
94 24 : bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
95 24 : TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
96 24 : return LegacyInlinerBase::runOnSCC(SCC);
97 : }
98 :
99 2 : void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
100 : AU.addRequired<TargetTransformInfoWrapperPass>();
101 2 : LegacyInlinerBase::getAnalysisUsage(AU);
102 2 : }
103 :
104 11 : unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
105 11 : int Thres = Params.DefaultThreshold;
106 :
107 : Function *Caller = CS.getCaller();
108 : // Listen to the inlinehint attribute when it would increase the threshold
109 : // and the caller does not need to minimize its size.
110 : Function *Callee = CS.getCalledFunction();
111 22 : bool InlineHint = Callee && !Callee->isDeclaration() &&
112 : Callee->hasFnAttribute(Attribute::InlineHint);
113 0 : if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
114 0 : && !Caller->hasFnAttribute(Attribute::MinSize))
115 0 : Thres = Params.HintThreshold.getValue();
116 :
117 11 : const DataLayout &DL = Caller->getParent()->getDataLayout();
118 11 : if (!Callee)
119 0 : return (unsigned)Thres;
120 :
121 : // If we have a pointer to private array passed into a function
122 : // it will not be optimized out, leaving scratch usage.
123 : // Increase the inline threshold to allow inliniting in this case.
124 : uint64_t AllocaSize = 0;
125 : SmallPtrSet<const AllocaInst *, 8> AIVisited;
126 29 : for (Value *PtrArg : CS.args()) {
127 20 : Type *Ty = PtrArg->getType();
128 20 : if (!Ty->isPointerTy() ||
129 : Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
130 : continue;
131 14 : PtrArg = GetUnderlyingObject(PtrArg, DL);
132 : if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
133 14 : if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
134 2 : continue;
135 12 : AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
136 : // If the amount of stack memory is excessive we will not be able
137 : // to get rid of the scratch anyway, bail out.
138 12 : if (AllocaSize > ArgAllocaCutoff) {
139 : AllocaSize = 0;
140 : break;
141 : }
142 : }
143 : }
144 11 : if (AllocaSize)
145 6 : Thres += ArgAllocaCost;
146 :
147 11 : return (unsigned)Thres;
148 : }
149 :
150 : // Check if call is just a wrapper around another call.
151 : // In this case we only have call and ret instructions.
152 13 : static bool isWrapperOnlyCall(CallSite CS) {
153 : Function *Callee = CS.getCalledFunction();
154 13 : if (!Callee || Callee->size() != 1)
155 : return false;
156 : const BasicBlock &BB = Callee->getEntryBlock();
157 5 : if (const Instruction *I = BB.getFirstNonPHI()) {
158 5 : if (!isa<CallInst>(I)) {
159 : return false;
160 : }
161 4 : if (isa<ReturnInst>(*std::next(I->getIterator()))) {
162 : LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
163 : << Callee->getName() << '\n');
164 2 : return true;
165 : }
166 : }
167 : return false;
168 : }
169 :
170 17 : InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
171 : Function *Callee = CS.getCalledFunction();
172 : Function *Caller = CS.getCaller();
173 17 : TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
174 :
175 17 : if (!Callee || Callee->isDeclaration())
176 : return llvm::InlineCost::getNever("undefined callee");
177 :
178 17 : if (CS.isNoInline())
179 : return llvm::InlineCost::getNever("noinline");
180 :
181 13 : if (!TTI.areInlineCompatible(Caller, Callee))
182 : return llvm::InlineCost::getNever("incompatible");
183 :
184 13 : if (CS.hasFnAttr(Attribute::AlwaysInline)) {
185 0 : if (isInlineViable(*Callee))
186 : return llvm::InlineCost::getAlways("alwaysinline viable");
187 : return llvm::InlineCost::getNever("alwaysinline unviable");
188 : }
189 :
190 13 : if (isWrapperOnlyCall(CS))
191 : return llvm::InlineCost::getAlways("wrapper-only call");
192 :
193 11 : InlineParams LocalParams = Params;
194 11 : LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
195 : bool RemarksEnabled = false;
196 : const auto &BBs = Caller->getBasicBlockList();
197 11 : if (!BBs.empty()) {
198 11 : auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
199 11 : if (DI.isEnabled())
200 : RemarksEnabled = true;
201 : }
202 :
203 11 : OptimizationRemarkEmitter ORE(Caller);
204 : std::function<AssumptionCache &(Function &)> GetAssumptionCache =
205 : [this](Function &F) -> AssumptionCache & {
206 8 : return ACT->getAssumptionCache(F);
207 : };
208 :
209 : return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
210 33 : None, PSI, RemarksEnabled ? &ORE : nullptr);
211 : }
|