LLVM  9.0.0svn
AMDGPUInline.cpp
Go to the documentation of this file.
1 //===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is AMDGPU specific replacement of the standard inliner.
11 /// The main purpose is to account for the fact that calls are not only expensive
12 /// on the AMDGPU, but much more expensive if a private memory pointer is
13 /// passed to a function as an argument. In this situation, we are unable to
14 /// eliminate private memory in the caller unless inlined and end up with slow
15 /// and expensive scratch access. Thus, we boost the inline threshold for such
16 /// functions here.
17 ///
18 //===----------------------------------------------------------------------===//
19 
20 
#include "AMDGPU.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Inliner.h"
36 
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "inline"
40 
41 static cl::opt<int>
42 ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
43  cl::desc("Cost of alloca argument"));
44 
45 // If the amount of scratch memory to eliminate exceeds our ability to allocate
46 // it into registers we gain nothing by aggressively inlining functions for that
47 // heuristic.
48 static cl::opt<unsigned>
49 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
50  cl::desc("Maximum alloca size to use for inline cost"));
51 
52 // Inliner constraint to achieve reasonable compilation time
53 static cl::opt<size_t>
54 MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
55  cl::desc("Maximum BB number allowed in a function after inlining"
56  " (compile time constraint)"));
57 
58 namespace {
59 
60 class AMDGPUInliner : public LegacyInlinerBase {
61 
62 public:
63  AMDGPUInliner() : LegacyInlinerBase(ID) {
65  Params = getInlineParams();
66  }
67 
68  static char ID; // Pass identification, replacement for typeid
69 
70  unsigned getInlineThreshold(CallSite CS) const;
71 
72  InlineCost getInlineCost(CallSite CS) override;
73 
74  bool runOnSCC(CallGraphSCC &SCC) override;
75 
76  void getAnalysisUsage(AnalysisUsage &AU) const override;
77 
78 private:
80 
81  InlineParams Params;
82 };
83 
84 } // end anonymous namespace
85 
86 char AMDGPUInliner::ID = 0;
87 INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
88  "AMDGPU Function Integration/Inlining", false, false)
94 INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
95  "AMDGPU Function Integration/Inlining", false, false)
96 
97 Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
98 
99 bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
100  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
101  return LegacyInlinerBase::runOnSCC(SCC);
102 }
103 
104 void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
107 }
108 
109 unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
110  int Thres = Params.DefaultThreshold;
111 
112  Function *Caller = CS.getCaller();
113  // Listen to the inlinehint attribute when it would increase the threshold
114  // and the caller does not need to minimize its size.
116  bool InlineHint = Callee && !Callee->isDeclaration() &&
117  Callee->hasFnAttribute(Attribute::InlineHint);
118  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
119  && !Caller->hasFnAttribute(Attribute::MinSize))
120  Thres = Params.HintThreshold.getValue() *
121  TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
122 
123  const DataLayout &DL = Caller->getParent()->getDataLayout();
124  if (!Callee)
125  return (unsigned)Thres;
126 
127  // If we have a pointer to private array passed into a function
128  // it will not be optimized out, leaving scratch usage.
129  // Increase the inline threshold to allow inliniting in this case.
130  uint64_t AllocaSize = 0;
132  for (Value *PtrArg : CS.args()) {
133  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
134  if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
136  continue;
137 
138  PtrArg = GetUnderlyingObject(PtrArg, DL);
139  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
140  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
141  continue;
142  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
143  // If the amount of stack memory is excessive we will not be able
144  // to get rid of the scratch anyway, bail out.
145  if (AllocaSize > ArgAllocaCutoff) {
146  AllocaSize = 0;
147  break;
148  }
149  }
150  }
151  if (AllocaSize)
152  Thres += ArgAllocaCost;
153 
154  return (unsigned)Thres;
155 }
156 
157 // Check if call is just a wrapper around another call.
158 // In this case we only have call and ret instructions.
159 static bool isWrapperOnlyCall(CallSite CS) {
161  if (!Callee || Callee->size() != 1)
162  return false;
163  const BasicBlock &BB = Callee->getEntryBlock();
164  if (const Instruction *I = BB.getFirstNonPHI()) {
165  if (!isa<CallInst>(I)) {
166  return false;
167  }
168  if (isa<ReturnInst>(*std::next(I->getIterator()))) {
169  LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
170  << Callee->getName() << '\n');
171  return true;
172  }
173  }
174  return false;
175 }
176 
179  Function *Caller = CS.getCaller();
180 
181  if (!Callee || Callee->isDeclaration())
182  return llvm::InlineCost::getNever("undefined callee");
183 
184  if (CS.isNoInline())
185  return llvm::InlineCost::getNever("noinline");
186 
187  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
188  if (!TTI.areInlineCompatible(Caller, Callee))
189  return llvm::InlineCost::getNever("incompatible");
190 
191  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
192  auto IsViable = isInlineViable(*Callee);
193  if (IsViable)
194  return llvm::InlineCost::getAlways("alwaysinline viable");
195  return llvm::InlineCost::getNever(IsViable.message);
196  }
197 
198  if (isWrapperOnlyCall(CS))
199  return llvm::InlineCost::getAlways("wrapper-only call");
200 
201  InlineParams LocalParams = Params;
202  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
203  bool RemarksEnabled = false;
204  const auto &BBs = Caller->getBasicBlockList();
205  if (!BBs.empty()) {
206  auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
207  if (DI.isEnabled())
208  RemarksEnabled = true;
209  }
210 
211  OptimizationRemarkEmitter ORE(Caller);
212  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
213  [this](Function &F) -> AssumptionCache & {
214  return ACT->getAssumptionCache(F);
215  };
216 
217  auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
218  LocalParams, TTI, GetAssumptionCache, None, PSI,
219  RemarksEnabled ? &ORE : nullptr);
220 
221  if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
222  // Single BB does not increase total BB amount, thus subtract 1
223  size_t Size = Caller->size() + Callee->size() - 1;
224  if (MaxBB && Size > MaxBB)
225  return llvm::InlineCost::getNever("max number of bb exceeded");
226  }
227  return IC;
228 }
size_t size() const
Definition: Function.h:685
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:80
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:110
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:153
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function< AssumptionCache &(Function &)> &GetAssumptionCache, Optional< function_ref< BlockFrequencyInfo &(Function &)>> GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define DEBUG_TYPE
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:323
A debug info location.
Definition: DebugLoc.h:33
F(f)
FunTy * getCalledFunction() const
Return the function being called if this is a direct call, otherwise return null (if it's an indirect...
Definition: CallSite.h:111
static bool isWrapperOnlyCall(CallSite CS)
Represents the cost of inlining a function.
Definition: InlineCost.h:63
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
bool runOnSCC(CallGraphSCC &SCC) override
Main run interface method, this implements the interface required by the Pass class.
Definition: Inliner.cpp:501
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:369
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Pass * createAMDGPUFunctionInliningPass()
This class contains all of the helper code which is used to perform the inlining operations that do n...
Definition: Inliner.h:30
amdgpu AMDGPU Function Integration Inlining
void initializeAMDGPUInlinerPass(PassRegistry &)
InstrTy * getInstruction() const
Definition: CallSite.h:96
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
static cl::opt< int > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500), cl::desc("Cost of alloca argument"))
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static InlineCost getAlways(const char *Reason)
Definition: InlineCost.h:91
Class to represent pointers.
Definition: DerivedTypes.h:544
const BasicBlock & getEntryBlock() const
Definition: Function.h:664
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
void getAnalysisUsage(AnalysisUsage &Info) const override
For this class, we declare that we require and preserve the call graph.
Definition: Inliner.cpp:131
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:189
Wrapper pass for TargetTransformInfo.
The ModulePass which wraps up a CallGraph and the logic to build it.
Definition: CallGraph.h:324
bool isNoInline() const
Return true if the call should not be inlined.
Definition: CallSite.h:451
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
amdgpu inline
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
Diagnostic information for applied optimization remarks.
Represent the analysis usage information of a pass.
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:572
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
iterator_range< IterTy > args() const
Definition: CallSite.h:222
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options...
Module.h This file contains the declarations for the Module class.
Address space for private memory.
Definition: AMDGPU.h:271
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if this function has the given attribute.
Definition: CallSite.h:370
amdgpu Simplify well known AMD library false FunctionCallee Callee
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:469
static InlineCost getNever(const char *Reason)
Definition: InlineCost.h:94
Address space for flat memory.
Definition: AMDGPU.h:265
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:214
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
#define I(x, y, z)
Definition: MD5.cpp:58
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
const BasicBlockListType & getBasicBlockList() const
Get the underlying elements of the Function...
Definition: Function.h:657
uint32_t Size
Definition: Profile.cpp:46
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:227
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", "AMDGPU Function Integration/Inlining", false, false) INITIALIZE_PASS_END(AMDGPUInliner
LLVM Value Representation.
Definition: Value.h:72
CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on.
static cl::opt< size_t > MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), cl::desc("Maximum BB number allowed in a function after inlining" " (compile time constraint)"))
This pass exposes codegen information to IR-level passes.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
The optimization diagnostic interface.
int DefaultThreshold
The default threshold to start with for a callee.
Definition: InlineCost.h:155
an instruction to allocate memory on the stack
Definition: Instructions.h:59
FunTy * getCaller() const
Return the caller function for this call site.
Definition: CallSite.h:275