Line data Source code
1 : //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file implements inline cost analysis.
11 : //
12 : //===----------------------------------------------------------------------===//
13 :
14 : #include "llvm/Analysis/InlineCost.h"
15 : #include "llvm/ADT/STLExtras.h"
16 : #include "llvm/ADT/SetVector.h"
17 : #include "llvm/ADT/SmallPtrSet.h"
18 : #include "llvm/ADT/SmallVector.h"
19 : #include "llvm/ADT/Statistic.h"
20 : #include "llvm/Analysis/AssumptionCache.h"
21 : #include "llvm/Analysis/BlockFrequencyInfo.h"
22 : #include "llvm/Analysis/CodeMetrics.h"
23 : #include "llvm/Analysis/ConstantFolding.h"
24 : #include "llvm/Analysis/CFG.h"
25 : #include "llvm/Analysis/InstructionSimplify.h"
26 : #include "llvm/Analysis/ProfileSummaryInfo.h"
27 : #include "llvm/Analysis/TargetTransformInfo.h"
28 : #include "llvm/Analysis/ValueTracking.h"
29 : #include "llvm/Config/llvm-config.h"
30 : #include "llvm/IR/CallSite.h"
31 : #include "llvm/IR/CallingConv.h"
32 : #include "llvm/IR/DataLayout.h"
33 : #include "llvm/IR/GetElementPtrTypeIterator.h"
34 : #include "llvm/IR/GlobalAlias.h"
35 : #include "llvm/IR/InstVisitor.h"
36 : #include "llvm/IR/IntrinsicInst.h"
37 : #include "llvm/IR/Operator.h"
38 : #include "llvm/Support/Debug.h"
39 : #include "llvm/Support/raw_ostream.h"
40 :
41 : using namespace llvm;
42 :
43 : #define DEBUG_TYPE "inline-cost"
44 :
45 : STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
46 :
47 : static cl::opt<int> InlineThreshold(
48 : "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
49 : cl::desc("Control the amount of inlining to perform (default = 225)"));
50 :
51 : static cl::opt<int> HintThreshold(
52 : "inlinehint-threshold", cl::Hidden, cl::init(325),
53 : cl::desc("Threshold for inlining functions with inline hint"));
54 :
55 : static cl::opt<int>
56 : ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
57 : cl::init(45),
58 : cl::desc("Threshold for inlining cold callsites"));
59 :
60 : // We introduce this threshold to help performance of instrumentation based
61 : // PGO before we actually hook up the inliner with analysis passes such as BPI
62 : // BFI.
63 : static cl::opt<int> ColdThreshold(
64 : "inlinecold-threshold", cl::Hidden, cl::init(45),
65 : cl::desc("Threshold for inlining functions with cold attribute"));
66 :
67 : static cl::opt<int>
68 : HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
69 : cl::ZeroOrMore,
70 : cl::desc("Threshold for hot callsites "));
71 :
72 : static cl::opt<int> LocallyHotCallSiteThreshold(
73 : "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
74 : cl::desc("Threshold for locally hot callsites "));
75 :
76 : static cl::opt<int> ColdCallSiteRelFreq(
77 : "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
78 : cl::desc("Maximum block frequency, expressed as a percentage of caller's "
79 : "entry frequency, for a callsite to be cold in the absence of "
80 : "profile information."));
81 :
82 : static cl::opt<int> HotCallSiteRelFreq(
83 : "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
84 : cl::desc("Minimum block frequency, expressed as a multiple of caller's "
85 : "entry frequency, for a callsite to be hot in the absence of "
86 : "profile information."));
87 :
88 : static cl::opt<bool> OptComputeFullInlineCost(
89 : "inline-cost-full", cl::Hidden, cl::init(false),
90 : cl::desc("Compute the full inline cost of a call site even when the cost "
91 : "exceeds the threshold."));
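// Usage note (illustrative): these knobs are ordinary cl::opt flags, so the
// defaults above can be overridden on the command line, e.g.
//   opt -inline -inline-threshold=500 input.ll -S -o output.ll
// or, when driving LLVM through clang, with -mllvm -inline-threshold=500; the
// exact pass spelling depends on which pass manager is in use.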
92 :
93 : namespace {
94 :
95 : class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
96 : typedef InstVisitor<CallAnalyzer, bool> Base;
97 : friend class InstVisitor<CallAnalyzer, bool>;
98 :
99 : /// The TargetTransformInfo available for this compilation.
100 : const TargetTransformInfo &TTI;
101 :
102 : /// Getter for the cache of @llvm.assume intrinsics.
103 : std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
104 :
105 : /// Getter for BlockFrequencyInfo
106 : Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
107 :
108 : /// Profile summary information.
109 : ProfileSummaryInfo *PSI;
110 :
111 : /// The called function.
112 : Function &F;
113 :
114 : // Cache the DataLayout since we use it a lot.
115 : const DataLayout &DL;
116 :
117 : /// The OptimizationRemarkEmitter available for this compilation.
118 : OptimizationRemarkEmitter *ORE;
119 :
120 : /// The candidate callsite being analyzed. Please do not use this to do
121 : /// analysis in the caller function; we want the inline cost query to be
122 : /// easily cacheable. Instead, use the cover function paramHasAttr.
123 : CallSite CandidateCS;
124 :
125 : /// Tunable parameters that control the analysis.
126 : const InlineParams &Params;
127 :
128 : int Threshold;
129 : int Cost;
130 : bool ComputeFullInlineCost;
131 :
132 : bool IsCallerRecursive;
133 : bool IsRecursiveCall;
134 : bool ExposesReturnsTwice;
135 : bool HasDynamicAlloca;
136 : bool ContainsNoDuplicateCall;
137 : bool HasReturn;
138 : bool HasIndirectBr;
139 : bool HasUninlineableIntrinsic;
140 : bool InitsVargArgs;
141 :
142 : /// Number of bytes allocated statically by the callee.
143 : uint64_t AllocatedSize;
144 : unsigned NumInstructions, NumVectorInstructions;
145 : int VectorBonus, TenPercentVectorBonus;
146 : // Bonus to be applied when the callee has only one reachable basic block.
147 : int SingleBBBonus;
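// Worked example (illustrative, using the default parameters set up in
// updateThreshold() below): with Threshold = 225, SingleBBBonusPercent = 50
// and VectorBonusPercent = 150,
//   SingleBBBonus = 225 * 50 / 100  = 112
//   VectorBonus   = 225 * 150 / 100 = 337
// Both bonuses are applied speculatively and withdrawn again if the callee
// turns out to have more than one reachable block or too few vector
// instructions.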
148 :
149 : /// While we walk the potentially-inlined instructions, we build up and
150 : /// maintain a mapping of simplified values specific to this callsite. The
151 : /// idea is to propagate any special information we have about arguments to
152 : /// this call through the inlinable section of the function, and account for
153 : /// likely simplifications post-inlining. The most important aspect we track
154 : /// is CFG altering simplifications -- when we prove a basic block dead, that
155 : /// can cause dramatic shifts in the cost of inlining a function.
156 : DenseMap<Value *, Constant *> SimplifiedValues;
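// For example (illustrative): if the callsite passes the constant 0 for an
// argument %flag, then "%c = icmp eq i32 %flag, 0" inside the callee maps to
// "true" here, the branch on %c has a single known successor (recorded in
// KnownSuccessors), and the untaken block ends up in DeadBlocks, so none of
// its instructions are charged to the inline cost.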
157 :
158 : /// Keep track of the values which map back (through function arguments) to
159 : /// allocas on the caller stack which could be simplified through SROA.
160 : DenseMap<Value *, Value *> SROAArgValues;
161 :
162 : /// The mapping of caller Alloca values to their accumulated cost savings. If
163 : /// we have to disable SROA for one of the allocas, this tells us how much
164 : /// cost must be added.
165 : DenseMap<Value *, int> SROAArgCosts;
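// For example (illustrative): if a caller alloca %a is passed as argument %p,
// then %p and values derived from it (bitcasts, in-bounds GEPs) map back to
// %a through SROAArgValues, and SROAArgCosts[%a] accumulates the cost of the
// loads and stores SROA would remove; any use that defeats SROA re-adds that
// accumulated sum to Cost via disableSROA().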
166 :
167 : /// Keep track of values which map to a pointer base and constant offset.
168 : DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
169 :
170 : /// Keep track of dead blocks due to the constant arguments.
171 : SetVector<BasicBlock *> DeadBlocks;
172 :
173 : /// The mapping of the blocks to their known unique successors due to the
174 : /// constant arguments.
175 : DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
176 :
177 : /// Model the elimination of repeated loads that is expected to happen
178 : /// whenever we simplify away the stores that would otherwise make those
179 : /// repeated loads necessary.
180 : bool EnableLoadElimination;
181 : SmallPtrSet<Value *, 16> LoadAddrSet;
182 : int LoadEliminationCost;
183 :
184 : // Custom simplification helper routines.
185 : bool isAllocaDerivedArg(Value *V);
186 : bool lookupSROAArgAndCost(Value *V, Value *&Arg,
187 : DenseMap<Value *, int>::iterator &CostIt);
188 : void disableSROA(DenseMap<Value *, int>::iterator CostIt);
189 : void disableSROA(Value *V);
190 : void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
191 : void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
192 : int InstructionCost);
193 : void disableLoadElimination();
194 : bool isGEPFree(GetElementPtrInst &GEP);
195 : bool canFoldInboundsGEP(GetElementPtrInst &I);
196 : bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
197 : bool simplifyCallSite(Function *F, CallSite CS);
198 : template <typename Callable>
199 : bool simplifyInstruction(Instruction &I, Callable Evaluate);
200 : ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
201 :
202 : /// Return true if the given argument to the function being considered for
203 : /// inlining has the given attribute set either at the call site or the
204 : /// function declaration. Primarily used to inspect call site specific
205 : /// attributes since these can be more precise than the ones on the callee
206 : /// itself.
207 : bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
208 :
209 : /// Return true if the given value is known non-null within the callee if
210 : /// inlined through this particular callsite.
211 : bool isKnownNonNullInCallee(Value *V);
212 :
213 : /// Update Threshold based on callsite properties such as callee
214 : /// attributes and callee hotness for PGO builds. The Callee is explicitly
215 : /// passed to support analyzing indirect calls whose target is inferred by
216 : /// analysis.
217 : void updateThreshold(CallSite CS, Function &Callee);
218 :
219 : /// Return true if size growth is allowed when inlining the callee at CS.
220 : bool allowSizeGrowth(CallSite CS);
221 :
222 : /// Return true if \p CS is a cold callsite.
223 : bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
224 :
225 : /// Return a higher threshold if \p CS is a hot callsite.
226 : Optional<int> getHotCallSiteThreshold(CallSite CS,
227 : BlockFrequencyInfo *CallerBFI);
228 :
229 : // Custom analysis routines.
230 : InlineResult analyzeBlock(BasicBlock *BB,
231 : SmallPtrSetImpl<const Value *> &EphValues);
232 :
233 : // Disable several entry points to the visitor so we don't accidentally use
234 : // them by declaring but not defining them here.
235 : void visit(Module *);
236 : void visit(Module &);
237 : void visit(Function *);
238 : void visit(Function &);
239 : void visit(BasicBlock *);
240 : void visit(BasicBlock &);
241 :
242 : // Provide base case for our instruction visit.
243 : bool visitInstruction(Instruction &I);
244 :
245 : // Our visit overrides.
246 : bool visitAlloca(AllocaInst &I);
247 : bool visitPHI(PHINode &I);
248 : bool visitGetElementPtr(GetElementPtrInst &I);
249 : bool visitBitCast(BitCastInst &I);
250 : bool visitPtrToInt(PtrToIntInst &I);
251 : bool visitIntToPtr(IntToPtrInst &I);
252 : bool visitCastInst(CastInst &I);
253 : bool visitUnaryInstruction(UnaryInstruction &I);
254 : bool visitCmpInst(CmpInst &I);
255 : bool visitSub(BinaryOperator &I);
256 : bool visitBinaryOperator(BinaryOperator &I);
257 : bool visitLoad(LoadInst &I);
258 : bool visitStore(StoreInst &I);
259 : bool visitExtractValue(ExtractValueInst &I);
260 : bool visitInsertValue(InsertValueInst &I);
261 : bool visitCallSite(CallSite CS);
262 : bool visitReturnInst(ReturnInst &RI);
263 : bool visitBranchInst(BranchInst &BI);
264 : bool visitSelectInst(SelectInst &SI);
265 : bool visitSwitchInst(SwitchInst &SI);
266 : bool visitIndirectBrInst(IndirectBrInst &IBI);
267 : bool visitResumeInst(ResumeInst &RI);
268 : bool visitCleanupReturnInst(CleanupReturnInst &RI);
269 : bool visitCatchReturnInst(CatchReturnInst &RI);
270 : bool visitUnreachableInst(UnreachableInst &I);
271 :
272 : public:
273 306857 : CallAnalyzer(const TargetTransformInfo &TTI,
274 : std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
275 : Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
276 : ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
277 : Function &Callee, CallSite CSArg, const InlineParams &Params)
278 306857 : : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
279 306857 : PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
280 306857 : CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
281 306857 : Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
282 306857 : Params.ComputeFullInlineCost || ORE),
283 : IsCallerRecursive(false), IsRecursiveCall(false),
284 : ExposesReturnsTwice(false), HasDynamicAlloca(false),
285 : ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
286 : HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0),
287 : NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
288 : SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
289 : NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
290 : NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
291 : NumInstructionsSimplified(0), SROACostSavings(0),
292 920571 : SROACostSavingsLost(0) {}
293 :
294 : InlineResult analyzeCall(CallSite CS);
295 :
296 0 : int getThreshold() { return Threshold; }
297 0 : int getCost() { return Cost; }
298 :
299 : // Keep a bunch of stats about the cost savings found so we can print them
300 : // out when debugging.
301 : unsigned NumConstantArgs;
302 : unsigned NumConstantOffsetPtrArgs;
303 : unsigned NumAllocaArgs;
304 : unsigned NumConstantPtrCmps;
305 : unsigned NumConstantPtrDiffs;
306 : unsigned NumInstructionsSimplified;
307 : unsigned SROACostSavings;
308 : unsigned SROACostSavingsLost;
309 :
310 : void dump();
311 : };
312 :
313 : } // namespace
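// Usage sketch (illustrative; this mirrors how the indirect-call handling in
// visitCallSite() below drives a nested analysis, and how a caller consumes
// the result -- names other than CallAnalyzer's own members are placeholders):
//
//   InlineParams Params = /* e.g. from getInlineParams() */;
//   CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
//                   Params);
//   if (CA.analyzeCall(CS))
//     UseCostAndThreshold(CA.getCost(), CA.getThreshold());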
314 :
315 : /// Test whether the given value is an Alloca-derived function argument.
316 : bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
317 141711 : return SROAArgValues.count(V);
318 : }
319 :
320 : /// Lookup the SROA-candidate argument and cost iterator which V maps to.
321 : /// Returns false if V does not map to a SROA-candidate.
322 21291992 : bool CallAnalyzer::lookupSROAArgAndCost(
323 : Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
324 21291992 : if (SROAArgValues.empty() || SROAArgCosts.empty())
325 : return false;
326 :
327 8472557 : DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
328 8472557 : if (ArgIt == SROAArgValues.end())
329 : return false;
330 :
331 523755 : Arg = ArgIt->second;
332 523755 : CostIt = SROAArgCosts.find(Arg);
333 523755 : return CostIt != SROAArgCosts.end();
334 : }
335 :
336 : /// Disable SROA for the candidate marked by this cost iterator.
337 : ///
338 : /// This marks the candidate as no longer viable for SROA, and adds the cost
339 : /// savings associated with it back into the inline cost measurement.
340 0 : void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
341 : // If we're no longer able to perform SROA we need to undo its cost savings
342 : // and prevent subsequent analysis.
343 109934 : Cost += CostIt->second;
344 109934 : SROACostSavings -= CostIt->second;
345 0 : SROACostSavingsLost += CostIt->second;
346 : SROAArgCosts.erase(CostIt);
347 : disableLoadElimination();
348 0 : }
349 :
350 : /// If 'V' maps to a SROA candidate, disable SROA for it.
351 12184919 : void CallAnalyzer::disableSROA(Value *V) {
352 : Value *SROAArg;
353 12184919 : DenseMap<Value *, int>::iterator CostIt;
354 12184919 : if (lookupSROAArgAndCost(V, SROAArg, CostIt))
355 105214 : disableSROA(CostIt);
356 12184919 : }
357 :
358 : /// Accumulate the given cost for a particular SROA candidate.
359 0 : void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
360 : int InstructionCost) {
361 184551 : CostIt->second += InstructionCost;
362 184551 : SROACostSavings += InstructionCost;
363 0 : }
364 :
365 : void CallAnalyzer::disableLoadElimination() {
366 4478158 : if (EnableLoadElimination) {
367 287144 : Cost += LoadEliminationCost;
368 287144 : LoadEliminationCost = 0;
369 287144 : EnableLoadElimination = false;
370 : }
371 : }
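// Self-contained sketch of the load-elimination policy (illustrative only;
// plain STL types stand in for the LLVM ones):
//
//   struct LoadElimModel {
//     std::set<const void *> Addrs; // models LoadAddrSet
//     int Credit = 0;               // models LoadEliminationCost
//     bool Enabled = true;          // models EnableLoadElimination
//     int visitLoad(const void *A, int InstrCost) {
//       if (Enabled && !Addrs.insert(A).second) {
//         Credit += InstrCost;      // repeated load: treat it as free for now
//         return 0;
//       }
//       return InstrCost;           // first load from A is charged normally
//     }
//     int clobber() {               // a store or unknown call was seen
//       int Refund = Enabled ? Credit : 0;
//       Enabled = false;
//       Credit = 0;
//       return Refund;              // re-charge the loads we skipped above
//     }
//   };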
372 :
373 : /// Accumulate a constant GEP offset into an APInt if possible.
374 : ///
375 : /// Returns false if unable to compute the offset for any reason. Respects any
376 : /// simplified values known during the analysis of this callsite.
377 549392 : bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
378 549392 : unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
379 : assert(IntPtrWidth == Offset.getBitWidth());
380 :
381 1840712 : for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
382 3132032 : GTI != GTE; ++GTI) {
383 : ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
384 : if (!OpC)
385 72362 : if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
386 : OpC = dyn_cast<ConstantInt>(SimpleOp);
387 1324697 : if (!OpC)
388 33377 : return false;
389 1291320 : if (OpC->isZero())
390 1259562 : continue;
391 :
392 : // Handle a struct index, which adds its field offset to the pointer.
393 243387 : if (StructType *STy = GTI.getStructTypeOrNull()) {
394 243387 : unsigned ElementIdx = OpC->getZExtValue();
395 243387 : const StructLayout *SL = DL.getStructLayout(STy);
396 243387 : Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
397 243387 : continue;
398 : }
399 :
400 31758 : APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
401 63516 : Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
402 : }
403 516015 : return true;
404 : }
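// Worked example (illustrative, assuming a typical 64-bit data layout with a
// 4-byte int and an 8-byte double): for
//   struct S { int a; int b; double xs[4]; }; // field offsets 0, 4, 8
// the GEP "getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 2, i64 3"
// accumulates
//   0 * sizeof(S) + offsetof(S, xs) + 3 * sizeof(double) = 0 + 8 + 24 = 32
// bytes into Offset; if any index is neither a ConstantInt nor simplified to
// one, the function returns false instead.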
405 :
406 : /// Use TTI to check whether a GEP is free.
407 : ///
408 : /// Respects any simplified values known during the analysis of this callsite.
409 65189 : bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
410 : SmallVector<Value *, 4> Operands;
411 65189 : Operands.push_back(GEP.getOperand(0));
412 182329 : for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
413 234280 : if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
414 141 : Operands.push_back(SimpleOp);
415 : else
416 116999 : Operands.push_back(*I);
417 130378 : return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
418 : }
419 :
420 2598081 : bool CallAnalyzer::visitAlloca(AllocaInst &I) {
421 : // Check whether inlining will turn a dynamic alloca into a static
422 : // alloca and handle that case.
423 2598081 : if (I.isArrayAllocation()) {
424 100 : Constant *Size = SimplifiedValues.lookup(I.getArraySize());
425 46 : if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
426 41 : Type *Ty = I.getAllocatedType();
427 41 : AllocatedSize = SaturatingMultiplyAdd(
428 41 : AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
429 41 : return Base::visitAlloca(I);
430 : }
431 : }
432 :
433 : // Accumulate the allocated size.
434 2598040 : if (I.isStaticAlloca()) {
435 2598033 : Type *Ty = I.getAllocatedType();
436 5196066 : AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
437 : }
438 :
439 : // We will happily inline static alloca instructions.
440 2598040 : if (I.isStaticAlloca())
441 2598033 : return Base::visitAlloca(I);
442 :
443 : // FIXME: This is overly conservative. Dynamic allocas are inefficient for
444 : // a variety of reasons, and so we would like to not inline them into
445 : // functions which don't currently have a dynamic alloca. This simply
446 : // disables inlining altogether in the presence of a dynamic alloca.
447 7 : HasDynamicAlloca = true;
448 7 : return false;
449 : }
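// Example (illustrative): "%buf = alloca [64 x i8]" is a static alloca and
// adds 64 bytes to AllocatedSize (via a saturating add so a pathological
// callee cannot overflow the counter), whereas "%p = alloca i8, i32 %n" with
// a non-constant %n sets HasDynamicAlloca and blocks inlining of this callee.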
450 :
451 419260 : bool CallAnalyzer::visitPHI(PHINode &I) {
452 : // FIXME: We need to propagate SROA *disabling* through phi nodes, even
453 : // though we don't want to propagate its bonuses. The idea is to disable
454 : // SROA if it *might* be used in an inappropriate manner.
455 :
456 : // Phi nodes are always zero-cost.
457 : // FIXME: Pointer sizes may differ between different address spaces, so do we
458 : // need to use correct address space in the call to getPointerSizeInBits here?
459 : // Or could we skip the getPointerSizeInBits call completely? As far as I can
460 : // see the ZeroOffset is used as a dummy value, so we can probably use any
461 : // bit width for the ZeroOffset?
462 419260 : APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
463 419260 : bool CheckSROA = I.getType()->isPointerTy();
464 :
465 : // Track the constant or pointer with constant offset we've seen so far.
466 : Constant *FirstC = nullptr;
467 : std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
468 : Value *FirstV = nullptr;
469 :
470 664358 : for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
471 658124 : BasicBlock *Pred = I.getIncomingBlock(i);
472 : // If the incoming block is dead, skip the incoming block.
473 658124 : if (DeadBlocks.count(Pred))
474 239325 : continue;
475 : // If the parent block of phi is not the known successor of the incoming
476 : // block, skip the incoming block.
477 651659 : BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
478 651659 : if (KnownSuccessor && KnownSuccessor != I.getParent())
479 : continue;
480 :
481 : Value *V = I.getIncomingValue(i);
482 : // If the incoming value is this phi itself, skip the incoming value.
483 650269 : if (&I == V)
484 : continue;
485 :
486 : Constant *C = dyn_cast<Constant>(V);
487 : if (!C)
488 435648 : C = SimplifiedValues.lookup(V);
489 :
490 : std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
491 650266 : if (!C && CheckSROA)
492 197608 : BaseAndOffset = ConstantOffsetPtrs.lookup(V);
493 :
494 650266 : if (!C && !BaseAndOffset.first)
495 : // The incoming value is neither a constant nor a pointer with constant
496 : // offset, exit early.
497 : return true;
498 :
499 440912 : if (FirstC) {
500 206949 : if (FirstC == C)
501 : // If we've seen a constant incoming value before and it is the same
502 : // constant we see this time, continue checking the next incoming value.
503 : continue;
504 : // Otherwise early exit because we either see a different constant or saw
505 : // a constant before but we have a pointer with constant offset this time.
506 : return true;
507 : }
508 :
509 233963 : if (FirstV) {
510 : // The same logic as above, but check pointer with constant offset here.
511 1961 : if (FirstBaseAndOffset == BaseAndOffset)
512 : continue;
513 : return true;
514 : }
515 :
516 232002 : if (C) {
517 : // This is the 1st time we've seen a constant, record it.
518 : FirstC = C;
519 : continue;
520 : }
521 :
522 : // The remaining case is that this is the 1st time we've seen a pointer with
523 : // constant offset, record it.
524 : FirstV = V;
525 : FirstBaseAndOffset = BaseAndOffset;
526 : }
527 :
528 : // Check if we can map phi to a constant.
529 6234 : if (FirstC) {
530 4683 : SimplifiedValues[&I] = FirstC;
531 4683 : return true;
532 : }
533 :
534 : // Check if we can map phi to a pointer with constant offset.
535 1551 : if (FirstBaseAndOffset.first) {
536 1551 : ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
537 :
538 : Value *SROAArg;
539 1551 : DenseMap<Value *, int>::iterator CostIt;
540 1551 : if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
541 196 : SROAArgValues[&I] = SROAArg;
542 : }
543 :
544 : return true;
545 : }
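// Example (illustrative): with both live predecessors feeding the same value,
//   %x = phi i32 [ 7, %then ], [ 7, %else ]
// the phi maps to the constant 7 in SimplifiedValues; if instead both incoming
// values are the same caller pointer at the same constant offset, the phi is
// recorded in ConstantOffsetPtrs (and in SROAArgValues when that pointer is
// an SROA candidate), so later GEPs and compares can still fold.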
546 :
547 : /// Check we can fold GEPs of constant-offset call site argument pointers.
548 : /// This requires target data and inbounds GEPs.
549 : ///
550 : /// \return true if the specified GEP can be folded.
551 836528 : bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
552 : // Check if we have a base + offset for the pointer.
553 : std::pair<Value *, APInt> BaseAndOffset =
554 836528 : ConstantOffsetPtrs.lookup(I.getPointerOperand());
555 836528 : if (!BaseAndOffset.first)
556 : return false;
557 :
558 : // Check if the offset of this GEP is constant, and if so accumulate it
559 : // into Offset.
560 446965 : if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
561 : return false;
562 :
563 : // Add the result as a new mapping to Base + Offset.
564 418786 : ConstantOffsetPtrs[&I] = BaseAndOffset;
565 :
566 418786 : return true;
567 : }
568 :
569 845959 : bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
570 : Value *SROAArg;
571 845959 : DenseMap<Value *, int>::iterator CostIt;
572 : bool SROACandidate =
573 845959 : lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
574 :
575 : // Lambda to check whether a GEP's indices are all constant.
576 : auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
577 : for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
578 : if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
579 : return false;
580 : return true;
581 845959 : };
582 :
583 845959 : if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
584 780770 : if (SROACandidate)
585 166014 : SROAArgValues[&I] = SROAArg;
586 :
587 : // Constant GEPs are modeled as free.
588 780770 : return true;
589 : }
590 :
591 : // Variable GEPs will require math and will disable SROA.
592 65189 : if (SROACandidate)
593 4193 : disableSROA(CostIt);
594 65189 : return isGEPFree(I);
595 : }
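// Example (illustrative): an in-bounds GEP whose base is a tracked
// constant-offset pointer, or whose indices are all constants, is modeled as
// free, e.g. "getelementptr inbounds %struct.S, %struct.S* %arg, i64 0, i32 1";
// a GEP with a non-constant index such as "getelementptr i32, i32* %p, i64 %i"
// disables SROA for the underlying alloca and is charged unless TTI reports
// it as free (isGEPFree).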
596 :
597 : /// Simplify \p I if its operands are constants and update SimplifiedValues.
598 : /// \p Evaluate is a callable specific to instruction type that evaluates the
599 : /// instruction when all the operands are constants.
600 : template <typename Callable>
601 3702575 : bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
602 : SmallVector<Constant *, 2> COps;
603 10033167 : for (Value *Op : I.operands()) {
604 3720956 : Constant *COp = dyn_cast<Constant>(Op);
605 3720956 : if (!COp)
606 2210666 : COp = SimplifiedValues.lookup(Op);
607 3720956 : if (!COp)
608 1092939 : return false;
609 2628017 : COps.push_back(COp);
610 : }
611 2609636 : auto *C = Evaluate(COps);
612 2609636 : if (!C)
613 : return false;
614 11562 : SimplifiedValues[&I] = C;
615 11562 : return true;
616 : }
617 19732 :
618 : bool CallAnalyzer::visitBitCast(BitCastInst &I) {
619 49360 : // Propagate constants through bitcasts.
620 29628 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
621 29628 : return ConstantExpr::getBitCast(COps[0], I.getType());
622 39464 : }))
623 29628 : return true;
624 19732 :
625 9896 : // Track base/offsets through casts
626 : std::pair<Value *, APInt> BaseAndOffset =
627 0 : ConstantOffsetPtrs.lookup(I.getOperand(0));
628 0 : // Casts don't change the offset, just wrap it up.
629 : if (BaseAndOffset.first)
630 0 : ConstantOffsetPtrs[&I] = BaseAndOffset;
631 0 :
632 : // Also look for SROA candidates here.
633 153410 : Value *SROAArg;
634 : DenseMap<Value *, int>::iterator CostIt;
635 306822 : if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
636 153410 : SROAArgValues[&I] = SROAArg;
637 153410 :
638 306820 : // Bitcasts are always zero cost.
639 153410 : return true;
640 153408 : }
641 2 :
642 : bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
643 2 : // Propagate constants through ptrtoint.
644 2 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
645 : return ConstantExpr::getPtrToInt(COps[0], I.getType());
646 2 : }))
647 2 : return true;
648 :
649 339205 : // Track base/offset pairs when converted to a plain integer provided the
650 : // integer is large enough to represent the pointer.
651 694490 : unsigned IntegerSize = I.getType()->getScalarSizeInBits();
652 347690 : unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
653 347690 : if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
654 680040 : std::pair<Value *, APInt> BaseAndOffset =
655 347690 : ConstantOffsetPtrs.lookup(I.getOperand(0));
656 331610 : if (BaseAndOffset.first)
657 16080 : ConstantOffsetPtrs[&I] = BaseAndOffset;
658 : }
659 7595 :
660 7595 : // This is really weird. Technically, ptrtoint will disable SROA. However,
661 : // unless that ptrtoint is *used* somewhere in the live basic blocks after
662 7595 : // inlining, it will be nuked, and SROA should proceed. All of the uses which
663 7595 : // would block SROA would also block SROA if applied directly to a pointer,
664 : // and so we can just add the integer in here. The only places where SROA is
665 2598074 : // preserved either cannot fire on an integer, or won't in-and-of themselves
666 : // disable SROA (ext) w/o some later use that we would see and disable.
667 7794222 : Value *SROAArg;
668 2598074 : DenseMap<Value *, int>::iterator CostIt;
669 2598074 : if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
670 82 : SROAArgValues[&I] = SROAArg;
671 2598074 :
672 0 : return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
673 2598074 : }
674 :
675 2598074 : bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
676 : // Propagate constants through inttoptr.
677 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
678 0 : return ConstantExpr::getIntToPtr(COps[0], I.getType());
679 0 : }))
680 : return true;
681 42558 :
682 : // Track base/offset pairs when round-tripped through a pointer without
683 87471 : // modifications provided the integer is not too large.
684 42558 : Value *Op = I.getOperand(0);
685 42558 : unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
686 85074 : if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
687 42558 : std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
688 40203 : if (BaseAndOffset.first)
689 2355 : ConstantOffsetPtrs[&I] = BaseAndOffset;
690 : }
691 2355 :
692 2355 : // "Propagate" SROA here in the same manner as we do for ptrtoint above.
693 : Value *SROAArg;
694 2355 : DenseMap<Value *, int>::iterator CostIt;
695 2355 : if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
696 : SROAArgValues[&I] = SROAArg;
697 11111 :
698 : return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
699 22311 : }
700 11111 :
701 11111 : bool CallAnalyzer::visitCastInst(CastInst &I) {
702 : // Propagate constants through casts.
703 11111 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
704 11022 : return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
705 89 : }))
706 : return true;
707 89 :
708 89 : // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
709 : disableSROA(I.getOperand(0));
710 89 :
711 89 : // If this is a floating-point cast, and the target says this operation
712 : // is expensive, this may eventually become a library call. Treat the cost
713 45266 : // as such.
714 : switch (I.getOpcode()) {
715 90605 : case Instruction::FPTrunc:
716 45266 : case Instruction::FPExt:
717 45266 : case Instruction::UIToFP:
718 90532 : case Instruction::SIToFP:
719 45266 : case Instruction::FPToUI:
720 45193 : case Instruction::FPToSI:
721 73 : if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
722 : Cost += InlineConstants::CallPenalty;
723 73 : default:
724 73 : break;
725 : }
726 73 :
727 73 : return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
728 : }
729 493219 :
730 : bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
731 987886 : Value *Operand = I.getOperand(0);
732 493219 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
733 493219 : return ConstantFoldInstOperands(&I, COps[0], DL);
734 986432 : }))
735 493219 : return true;
736 491771 :
737 1448 : // Disable any SROA on the argument to arbitrary unary operators.
738 : disableSROA(Operand);
739 1448 :
740 1448 : return false;
741 : }
742 1448 :
743 1448 : bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
744 : return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
745 : }
746 493219 :
747 : bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
748 493219 : // Does the *call site* have the NonNull attribute set on an argument? We
749 2896 : // use the attribute on the call site to memoize any analysis done in the
750 : // caller. This will also trip if the callee function has a non-null
751 : // parameter attribute, but that's a less interesting case because hopefully
752 : // the callee would already have been simplified based on that.
753 : if (Argument *A = dyn_cast<Argument>(V))
754 : if (paramHasAttr(A, Attribute::NonNull))
755 491771 : return true;
756 :
757 491771 : // Is this an alloca in the caller? This is distinct from the attribute case
758 293920 : // above because attributes aren't updated within the inliner itself and we
759 : // always want to catch the alloca derived case.
760 : if (isAllocaDerivedArg(V))
761 : // We can actually predict the result of comparisons between an
762 491771 : // alloca-derived value and null. Note that this fires regardless of
763 491771 : // SROA firing.
764 43234 : return true;
765 :
766 : return false;
767 : }
768 :
769 : bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
770 45266 : // If the normal destination of the invoke or the parent block of the call
771 : // site is unreachable-terminated, there is little point in inlining this
772 45266 : // unless there is literally zero cost.
773 146 : // FIXME: Note that it is possible that an unreachable-terminated block has a
774 : // hot entry. For example, in below scenario inlining hot_call_X() may be
775 : // beneficial :
776 : // main() {
777 : // hot_call_1();
778 : // ...
779 45193 : // hot_call_N()
780 45193 : // exit(0);
781 45193 : // }
782 : // For now, we are not handling this corner case here as it is rare in real
783 45191 : // code. In future, we should elaborate this based on BPI and BFI in more
784 45191 : // general threshold adjusting heuristics in updateThreshold().
785 25654 : Instruction *Instr = CS.getInstruction();
786 : if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
787 : if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
788 : return false;
789 : } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
790 : return false;
791 :
792 : return true;
793 : }
794 :
795 : bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
796 45193 : // If global profile summary is available, then callsite's coldness is
797 45193 : // determined based on that.
798 463 : if (PSI && PSI->hasProfileSummary())
799 : return PSI->isColdCallSite(CS, CallerBFI);
800 45193 :
801 : // Otherwise we need BFI to be available.
802 : if (!CallerBFI)
803 11111 : return false;
804 :
805 11111 : // Determine if the callsite is cold relative to caller's entry. We could
806 178 : // potentially cache the computation of scaled entry frequency, but the added
807 : // complexity is not worth it unless this scaling shows up high in the
808 : // profiles.
809 : const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
810 : auto CallSiteBB = CS.getInstruction()->getParent();
811 : auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
812 : auto CallerEntryFreq =
813 11022 : CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
814 11022 : return CallSiteFreq < CallerEntryFreq * ColdProb;
815 11020 : }
816 11020 :
817 0 : Optional<int>
818 : CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
819 : BlockFrequencyInfo *CallerBFI) {
820 :
821 : // If global profile summary is available, then callsite's hotness is
822 11022 : // determined based on that.
823 11022 : if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
824 0 : return Params.HotCallSiteThreshold;
825 :
826 11022 : // Otherwise we need BFI to be available and to have a locally hot callsite
827 : // threshold.
828 : if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
829 42558 : return None;
830 :
831 42558 : // Determine if the callsite is hot relative to caller's entry. We could
832 7065 : // potentially cache the computation of scaled entry frequency, but the added
833 : // complexity is not worth it unless this scaling shows up high in the
834 : // profiles.
835 : auto CallSiteBB = CS.getInstruction()->getParent();
836 : auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
837 40203 : auto CallerEntryFreq = CallerBFI->getEntryFreq();
838 : if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
839 : return Params.LocallyHotCallSiteThreshold;
840 :
841 : // Otherwise treat it normally.
842 40203 : return None;
843 181 : }
844 :
845 : void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
846 : // If no size growth is allowed for this inlining, set Threshold to 0.
847 : if (!allowSizeGrowth(CS)) {
848 : Threshold = 0;
849 181 : return;
850 24 : }
851 :
852 : Function *Caller = CS.getCaller();
853 :
854 : // return min(A, B) if B is valid.
855 40203 : auto MinIfValid = [](int A, Optional<int> B) {
856 : return B ? std::min(A, B.getValue()) : A;
857 : };
858 2598074 :
859 : // return max(A, B) if B is valid.
860 2598074 : auto MaxIfValid = [](int A, Optional<int> B) {
861 5196148 : return B ? std::max(A, B.getValue()) : A;
862 : };
863 :
864 : // Various bonus percentages. These are multiplied by Threshold to get the
865 : // bonus values.
866 2598074 : // SingleBBBonus: This bonus is applied if the callee has a single reachable
867 : // basic block at the given callsite context. This is speculatively applied
868 2598074 : // and withdrawn if more than one basic block is seen.
869 : //
870 : // Vector bonuses: We want to more aggressively inline vector-dense kernels
871 : // and apply this bonus based on the percentage of vector instructions. A
872 5545 : // bonus is applied if the vector instructions exceed 50% and half that amount
873 : // is applied if it exceeds 10%. Note that these bonuses are somewhat
874 : // arbitrary and evolved over time by accident as much as because they are
875 71536 : // principled bonuses.
876 : // FIXME: It would be nice to base the bonus values on something more
877 : // scientific.
878 : //
879 : // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
880 : // of the last call to a static function as inlining such functions is
881 : // guaranteed to reduce code size.
882 5545 : //
883 : // These bonus percentages may be set to 0 based on properties of the caller
884 : // and the callsite.
885 : int SingleBBBonusPercent = 50;
886 : int VectorBonusPercent = 150;
887 : int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
888 :
889 : // Lambda to set all the above bonus and bonus percentages to 0.
890 : auto DisallowAllBonuses = [&]() {
891 : SingleBBBonusPercent = 0;
892 23 : VectorBonusPercent = 0;
893 : LastCallToStaticBonus = 0;
894 : };
895 :
896 : // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
897 0 : // and reduce the threshold if the caller has the necessary attribute.
898 : if (Caller->optForMinSize()) {
899 : Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
900 : // For minsize, we want to disable the single BB bonus and the vector
901 : // bonuses, but not the last-call-to-static bonus. Inlining the last call to
902 : // a static function will, at the minimum, eliminate the parameter setup and
903 : // call/return instructions.
904 : SingleBBBonusPercent = 0;
905 : VectorBonusPercent = 0;
906 : } else if (Caller->optForSize())
907 : Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
908 :
909 : // Adjust the threshold based on inlinehint attribute and profile based
910 : // hotness information if the caller does not have MinSize attribute.
911 : if (!Caller->optForMinSize()) {
912 : if (Callee.hasFnAttribute(Attribute::InlineHint))
913 : Threshold = MaxIfValid(Threshold, Params.HintThreshold);
914 :
915 0 : // FIXME: After switching to the new passmanager, simplify the logic below
916 0 : // by checking only the callsite hotness/coldness as we will reliably
917 0 : // have local profile information.
918 0 : //
919 : // Callsite hotness and coldness can be determined if sample profile is
920 : // used (which adds hotness metadata to calls) or if caller's
921 : // BlockFrequencyInfo is available.
922 : BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
923 0 : auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
924 : if (!Caller->optForSize() && HotCallSiteThreshold) {
925 : LLVM_DEBUG(dbgs() << "Hot callsite.\n");
926 0 : // FIXME: This should update the threshold only if it exceeds the
927 0 : // current threshold, but AutoFDO + ThinLTO currently relies on this
928 : // behavior to prevent inlining of hot callsites during ThinLTO
929 : // compile phase.
930 0 : Threshold = HotCallSiteThreshold.getValue();
931 0 : } else if (isColdCallSite(CS, CallerBFI)) {
932 : LLVM_DEBUG(dbgs() << "Cold callsite.\n");
933 : // Do not apply bonuses for a cold callsite including the
934 : // LastCallToStatic bonus. While this bonus might result in code size
935 : // reduction, it can cause the size of a non-cold caller to increase
936 : // preventing it from being inlined.
937 0 : DisallowAllBonuses();
938 0 : Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
939 0 : } else if (PSI) {
940 : // Use callee's global profile information only if we have no way of
941 0 : // determining this via callsite information.
942 0 : if (PSI->isFunctionEntryHot(&Callee)) {
943 : LLVM_DEBUG(dbgs() << "Hot callee.\n");
944 : // If callsite hotness can not be determined, we may still know
945 : // that the callee is hot and treat it as a weaker hint for threshold
946 0 : // increase.
947 : Threshold = MaxIfValid(Threshold, Params.HintThreshold);
948 : } else if (PSI->isFunctionEntryCold(&Callee)) {
949 : LLVM_DEBUG(dbgs() << "Cold callee.\n");
950 : // Do not apply bonuses for a cold callee including the
951 0 : // LastCallToStatic bonus. While this bonus might result in code size
952 0 : // reduction, it can cause the size of a non-cold caller to increase
953 : // preventing it from being inlined.
954 : DisallowAllBonuses();
955 : Threshold = MinIfValid(Threshold, Params.ColdThreshold);
956 0 : }
957 : }
958 : }
959 :
960 : // Finally, take the target-specific inlining threshold multiplier into
961 : // account.
962 : Threshold *= TTI.getInliningThresholdMultiplier();
963 0 :
964 0 : SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
965 0 : VectorBonus = Threshold * VectorBonusPercent / 100;
966 0 :
967 0 : bool OnlyOneCallAndLocalLinkage =
968 : F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
969 : // If there is only one call of the function, and it has internal linkage,
970 : // the cost of inlining it drops dramatically. It may seem odd to update
971 : // Cost in updateThreshold, but the bonus depends on the logic in this method.
972 : if (OnlyOneCallAndLocalLinkage)
973 306857 : Cost -= LastCallToStaticBonus;
974 : }
975 306857 :
976 18511 : bool CallAnalyzer::visitCmpInst(CmpInst &I) {
977 18511 : Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
978 : // First try to handle simplified comparisons.
979 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
980 : return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
981 : }))
982 : return true;
983 :
984 61 : if (I.getOpcode() == Instruction::FCmp)
985 : return false;
986 :
987 : // Otherwise look for a comparison between constant offset pointers with
988 : // a common base.
989 52152 : Value *LHSBase, *RHSBase;
990 : APInt LHSOffset, RHSOffset;
991 : std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
992 : if (LHSBase) {
993 : std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
994 : if (RHSBase && LHSBase == RHSBase) {
995 : // We have common bases, fold the icmp to a constant based on the
996 : // offsets.
997 : Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
998 : Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
999 : if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1000 : SimplifiedValues[&I] = C;
1001 : ++NumConstantPtrCmps;
1002 : return true;
1003 : }
1004 : }
1005 : }
1006 :
1007 : // If the comparison is an equality comparison with null, we can simplify it
1008 : // if we know the value (argument) can't be null
1009 : if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1010 : isKnownNonNullInCallee(I.getOperand(0))) {
1011 : bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1012 : SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1013 288346 : : ConstantInt::getFalse(I.getType());
1014 288346 : return true;
1015 288346 : }
1016 : // Finally check for SROA candidates in comparisons.
1017 : Value *SROAArg;
1018 : DenseMap<Value *, int>::iterator CostIt;
1019 31 : if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1020 31 : if (isa<ConstantPointerNull>(I.getOperand(1))) {
1021 31 : accumulateSROACost(CostIt, InlineConstants::InstrCost);
1022 : return true;
1023 : }
1024 :
1025 : disableSROA(CostIt);
1026 288346 : }
1027 10 :
1028 : return false;
1029 : }
1030 :
1031 : bool CallAnalyzer::visitSub(BinaryOperator &I) {
1032 5 : // Try to handle a special case: we can fold computing the difference of two
1033 5 : // constant-related pointers.
1034 288341 : Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1035 87 : Value *LHSBase, *RHSBase;
1036 : APInt LHSOffset, RHSOffset;
1037 : std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1038 : if (LHSBase) {
1039 288346 : std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1040 288341 : if (RHSBase && LHSBase == RHSBase) {
1041 156450 : // We have common bases, fold the subtract to a constant based on the
1042 : // offsets.
1043 : Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1044 : Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1045 : if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1046 : SimplifiedValues[&I] = C;
1047 : ++NumConstantPtrDiffs;
1048 : return true;
1049 : }
1050 288341 : }
1051 288341 : }
1052 288341 :
1053 : // Otherwise, fall back to the generic logic for simplifying and handling
1054 : // instructions.
1055 : return Base::visitSub(I);
1056 : }
1057 :
1058 12 : bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1059 288329 : Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1060 : Constant *CLHS = dyn_cast<Constant>(LHS);
1061 : if (!CLHS)
1062 : CLHS = SimplifiedValues.lookup(LHS);
1063 : Constant *CRHS = dyn_cast<Constant>(RHS);
1064 : if (!CRHS)
1065 : CRHS = SimplifiedValues.lookup(RHS);
1066 54 :
1067 288311 : Value *SimpleV = nullptr;
1068 : if (auto FI = dyn_cast<FPMathOperator>(&I))
1069 : SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1070 287877 : CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1071 : else
1072 : SimpleV =
1073 : SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1074 :
1075 6 : if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1076 287875 : SimplifiedValues[&I] = C;
1077 :
1078 : if (SimpleV)
1079 : return true;
1080 :
1081 : // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1082 : disableSROA(LHS);
1083 31 : disableSROA(RHS);
1084 :
1085 : // If the instruction is floating point, and the target says this operation
1086 : // is expensive, this may eventually become a library call. Treat the cost
1087 : // as such.
1088 : if (I.getType()->isFloatingPointTy() &&
1089 : TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1090 288346 : Cost += InlineConstants::CallPenalty;
1091 :
1092 288346 : return false;
1093 288346 : }
1094 :
1095 : bool CallAnalyzer::visitLoad(LoadInst &I) {
1096 333509 : Value *SROAArg;
1097 : DenseMap<Value *, int>::iterator CostIt;
1098 : if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1099 : if (I.isSimple()) {
1100 : accumulateSROACost(CostIt, InlineConstants::InstrCost);
1101 6837 : return true;
1102 : }
1103 :
1104 339205 : disableSROA(CostIt);
1105 : }
1106 :
1107 339205 : // If the data is already loaded from this address and hasn't been clobbered
1108 15190 : // by any stores or calls, this load is likely to be redundant and can be
1109 : // eliminated.
1110 : if (EnableLoadElimination &&
1111 : !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1112 331610 : LoadEliminationCost += InlineConstants::InstrCost;
1113 : return true;
1114 : }
1115 :
1116 : return false;
1117 : }
1118 :
1119 330783 : bool CallAnalyzer::visitStore(StoreInst &I) {
1120 330783 : Value *SROAArg;
1121 9799 : DenseMap<Value *, int>::iterator CostIt;
1122 9799 : if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1123 : if (I.isSimple()) {
1124 : accumulateSROACost(CostIt, InlineConstants::InstrCost);
1125 129 : return true;
1126 129 : }
1127 129 :
1128 129 : disableSROA(CostIt);
1129 129 : }
1130 129 :
1131 : // The store can potentially clobber loads and prevent repeated loads from
1132 : // being eliminated.
1133 : // FIXME:
1134 : // 1. We can probably keep an initial set of eliminatable loads substracted
1135 : // from the cost even when we finally see a store. We just need to disable
1136 : // *further* accumulation of elimination savings.
1137 655072 : // 2. We should probably at some point thread MemorySSA for the callee into
1138 71536 : // this and then use that to actually compute *really* precise savings.
1139 : disableLoadElimination();
1140 1365 : return false;
1141 673 : }
1142 692 :
1143 : bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1144 : // Constant folding for extract value is trivial.
1145 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1146 329962 : return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1147 329962 : }))
1148 252 : return true;
1149 0 :
1150 0 : // SROA can look through these but give them a cost.
1151 : return false;
1152 : }
1153 252 :
1154 : bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1155 : // Constant folding for insert value is trivial.
1156 : if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1157 : return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1158 : /*InsertedValueOperand*/ COps[1],
1159 50485 : I.getIndices());
1160 : }))
1161 : return true;
1162 :
1163 : // SROA can look through these but give them a cost.
1164 : return false;
1165 50485 : }
1166 50485 :
1167 5834 : /// Try to simplify a call site.
1168 5834 : ///
1169 : /// Takes a concrete function and callsite and tries to actually simplify it by
1170 : /// analyzing the arguments and call itself with instsimplify. Returns true if
1171 62 : /// it has simplified the callsite to some other entity (a constant), making it
1172 62 : /// free.
1173 62 : bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1174 62 : // FIXME: Using the instsimplify logic directly for this is inefficient
1175 62 : // because we have to continually rebuild the argument list even when no
1176 62 : // simplifications can be performed. Until that is fixed with remapping
1177 : // inside of instsimplify, directly constant fold calls here.
1178 : if (!canConstantFoldCallTo(CS, F))
1179 : return false;
1180 :
1181 : // Try to re-map the arguments to constants.
1182 : SmallVector<Constant *, 4> ConstantArgs;
1183 50423 : ConstantArgs.reserve(CS.arg_size());
1184 : for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1185 : ++I) {
1186 3461622 : Constant *C = dyn_cast<Constant>(*I);
1187 : if (!C)
1188 : C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1189 : if (!C)
1190 6902186 : return false; // This argument doesn't map to a constant.
1191 :
1192 : ConstantArgs.push_back(C);
1193 202098 : }
1194 : if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1195 : SimplifiedValues[CS.getInstruction()] = C;
1196 : return true;
1197 1432 : }
1198 :
1199 : return false;
1200 : }
1201 10462080 :
1202 : bool CallAnalyzer::visitCallSite(CallSite CS) {
1203 : if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1204 7926 : !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1205 : // This aborts the entire analysis.
1206 3461622 : ExposesReturnsTwice = true;
1207 : return false;
1208 : }
1209 : if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1210 3452929 : ContainsNoDuplicateCall = true;
1211 3452929 :
1212 : if (Function *F = CS.getCalledFunction()) {
1213 : // When we have a concrete function, first try to simplify it directly.
1214 : if (simplifyCallSite(F, CS))
1215 : return true;
1216 3453295 :
1217 366 : // Next check if it is an intrinsic we know about.
1218 36 : // FIXME: Lift this into part of the InstVisitor.
1219 : if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1220 : switch (II->getIntrinsicID()) {
1221 : default:
1222 : if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1223 3767345 : disableLoadElimination();
1224 : return Base::visitCallSite(CS);
1225 3767345 :
1226 3767345 : case Intrinsic::load_relative:
1227 : // This is normally lowered to 4 LLVM instructions.
1228 138518 : Cost += 3 * InlineConstants::InstrCost;
1229 138518 : return false;
1230 :
1231 : case Intrinsic::memset:
1232 216 : case Intrinsic::memcpy:
1233 : case Intrinsic::memmove:
1234 : disableLoadElimination();
1235 : // SROA can usually chew through these intrinsics, but they aren't free.
1236 : return false;
1237 : case Intrinsic::icall_branch_funnel:
1238 3913045 : case Intrinsic::localescape:
1239 3628971 : HasUninlineableIntrinsic = true;
1240 663 : return false;
1241 663 : case Intrinsic::vastart:
1242 : InitsVargArgs = true;
1243 : return false;
1244 : }
1245 : }
1246 :
1247 3614266 : if (F == CS.getInstruction()->getFunction()) {
1248 : // This flag will fully abort the analysis, so don't bother with anything
1249 3614266 : // else.
1250 3614266 : IsRecursiveCall = true;
1251 : return false;
1252 46033 : }
1253 46033 :
1254 : if (TTI.isLoweredToCall(F)) {
1255 : // We account for the average 1 instruction per call argument setup
1256 59 : // here.
1257 : Cost += CS.arg_size() * InlineConstants::InstrCost;
1258 :
1259 : // Everything other than inline ASM will also have a significant cost
1260 : // merely from making the call.
1261 : if (!isa<InlineAsm>(CS.getCalledValue()))
1262 : Cost += InlineConstants::CallPenalty;
1263 : }
1264 :
1265 : if (!CS.onlyReadsMemory())
1266 : disableLoadElimination();
1267 : return Base::visitCallSite(CS);
1268 : }
1269 :
1270 : // Otherwise we're in a very special case -- an indirect function call. See
1271 : // if we can be particularly clever about this.
1272 : Value *Callee = CS.getCalledValue();
1273 153410 :
1274 2 : // First, pay the price of the argument setup. We account for the average
1275 : // 1 instruction per call argument setup here.
1276 : Cost += CS.arg_size() * InlineConstants::InstrCost;
1277 :
1278 : // Next, check if this happens to be an indirect function call to a known
1279 : // function in this inline context. If not, we've done all we can.
1280 : Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1281 : if (!F) {
1282 : if (!CS.onlyReadsMemory())
1283 : disableLoadElimination();
1284 19732 : return Base::visitCallSite(CS);
1285 : }
1286 :
1287 : // If we have a constant that we are calling as a function, we can peer
1288 : // through it and see the function target. This happens not infrequently
1289 : // during devirtualization and so we want to give it a hefty bonus for
1290 : // inlining, but cap that bonus in the event that inlining wouldn't pan
1291 : // out. Pretend to inline the function, with a custom threshold.
1292 : auto IndirectCallParams = Params;
1293 : IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1294 : CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1295 : IndirectCallParams);
1296 : if (CA.analyzeCall(CS)) {
1297 : // We were able to inline the indirect call! Subtract the cost from the
1298 : // threshold to get the bonus we want to apply, but don't go below zero.
1299 : Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1300 : }
1301 1149947 :
1302 : if (!F->onlyReadsMemory())
1303 : disableLoadElimination();
1304 : return Base::visitCallSite(CS);
1305 : }
1306 1149947 :
1307 : bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1308 : // At least one return instruction will be free after inlining.
1309 : bool Free = !HasReturn;
1310 : HasReturn = true;
1311 969 : return Free;
1312 1029 : }
1313 :
1314 1008 : bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1315 1008 : // We model unconditional branches as essentially free -- they really
1316 1942 : // shouldn't exist at all, but handling them makes the behavior of the
1317 1008 : // inliner more regular and predictable. Interestingly, conditional branches
1318 948 : // which will fold away are also free.
1319 : return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1320 60 : dyn_cast_or_null<ConstantInt>(
1321 : SimplifiedValues.lookup(BI.getCondition()));
1322 21 : }
1323 21 :
1324 21 : bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1325 : bool CheckSROA = SI.getType()->isPointerTy();
1326 : Value *TrueVal = SI.getTrueValue();
1327 : Value *FalseVal = SI.getFalseValue();
1328 :
1329 : Constant *TrueC = dyn_cast<Constant>(TrueVal);
1330 1170240 : if (!TrueC)
1331 1170248 : TrueC = SimplifiedValues.lookup(TrueVal);
1332 8 : Constant *FalseC = dyn_cast<Constant>(FalseVal);
1333 : if (!FalseC)
1334 4 : FalseC = SimplifiedValues.lookup(FalseVal);
1335 4 : Constant *CondC =
1336 : dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1337 1170236 :
1338 11 : if (!CondC) {
1339 : // Select C, X, X => X
1340 : if (TrueC == FalseC && TrueC) {
1341 : SimplifiedValues[&SI] = TrueC;
1342 1149947 : return true;
1343 : }
1344 :
1345 : if (!CheckSROA)
1346 : return Base::visitSelectInst(SI);
1347 :
1348 356782 : std::pair<Value *, APInt> TrueBaseAndOffset =
1349 332194 : ConstantOffsetPtrs.lookup(TrueVal);
1350 332194 : std::pair<Value *, APInt> FalseBaseAndOffset =
1351 : ConstantOffsetPtrs.lookup(FalseVal);
1352 332194 : if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1353 : ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1354 0 :
1355 : Value *SROAArg;
1356 0 : DenseMap<Value *, int>::iterator CostIt;
1357 0 : if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1358 : SROAArgValues[&SI] = SROAArg;
1359 : return true;
1360 : }
1361 :
1362 : return Base::visitSelectInst(SI);
1363 : }
1364 23920 :
1365 7 : // Select condition is a constant.
1366 : Value *SelectedV = CondC->isAllOnesValue()
1367 7 : ? TrueVal
1368 7 : : (CondC->isNullValue()) ? FalseVal : nullptr;
1369 661 : if (!SelectedV) {
1370 661 : // Condition is a vector constant that is not all 1s or all 0s. If all
1371 661 : // operands are constants, ConstantExpr::getSelect() can handle the cases
1372 : // such as select vectors.
1373 : if (TrueC && FalseC) {
1374 : if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1375 793144 : SimplifiedValues[&SI] = C;
1376 : return true;
1377 : }
1378 1758 : }
1379 1758 : return Base::visitSelectInst(SI);
1380 : }
1381 :
1382 791386 : // Condition is either all 1s or all 0s. SI can be simplified.
1383 : if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1384 : SimplifiedValues[&SI] = SelectedC;
1385 791384 : return true;
1386 : }
1387 :
1388 : if (!CheckSROA)
1389 791384 : return true;
1390 791384 :
1391 : std::pair<Value *, APInt> BaseAndOffset =
1392 : ConstantOffsetPtrs.lookup(SelectedV);
1393 791386 : if (BaseAndOffset.first) {
1394 : ConstantOffsetPtrs[&SI] = BaseAndOffset;
1395 791386 :
1396 : Value *SROAArg;
1397 : DenseMap<Value *, int>::iterator CostIt;
1398 : if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1399 : SROAArgValues[&SI] = SROAArg;
1400 : }
1401 :
1402 : return true;
1403 : }
1404 20289 :
1405 : bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1406 : // We model unconditional switches as free, see the comments on handling
1407 : // branches.
1408 40578 : if (isa<ConstantInt>(SI.getCondition()))
1409 : return true;
1410 19843 : if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1411 : if (isa<ConstantInt>(V))
1412 19843 : return true;
1413 :
1414 : // Assume the most general case where the switch is lowered into
1415 : // either a jump table, bit test, or a balanced binary tree consisting of
1416 : // case clusters without merging adjacent clusters with the same
1417 : // destination. We do not consider the switches that are lowered with a mix
1418 : // of jump table/bit test/binary search tree. The cost of the switch is
1419 : // proportional to the size of the tree or the size of jump table range.
1420 446 : //
1421 446 : // NB: We convert large switches which are just used to initialize large phi
1422 : // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1423 892 : // inlining those. It will prevent inlining in cases where the optimization
1424 446 : // does not (yet) fire.
1425 :
1426 : // Maximum valid cost increased in this function.
1427 574 : int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1428 :
1429 : // Exit early for a large switch, assuming one case needs at least one
1430 446 : // instruction.
1431 : // FIXME: This is not true for a bit test, but ignore such case for now to
1432 446 : // save compile-time.
1433 : int64_t CostLowerBound =
1434 : std::min((int64_t)CostUpperBound,
1435 0 : (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1436 :
1437 230078 : if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1438 230078 : Cost = CostLowerBound;
1439 0 : return false;
1440 : }
1441 :
1442 716568 : unsigned JumpTableSize = 0;
1443 : unsigned NumCaseCluster =
1444 : TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1445 :
1446 : // If suitable for a jump table, consider the cost for the table size and
1447 1082043 : // branch to destination.
1448 365478 : if (JumpTableSize) {
1449 365478 : int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1450 : 4 * InlineConstants::InstrCost;
1451 :
1452 12146 : Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1453 12146 : return false;
1454 : }
1455 :
1456 : // Considering forming a binary search, we should find the number of nodes
1457 : // which is same as the number of comparisons when lowered. For a given
1458 : // number of clusters, n, we can define a recursive function, f(n), to find
1459 8724 : // the number of nodes in the tree. The recursion is :
1460 : // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1461 : // and f(n) = n, when n <= 3.
1462 12282 : // This will lead to a binary tree where the leaves should be either f(2) or f(3)
1463 : // when n > 3. So, the number of comparisons from leaves should be n, while
1464 24292 : // the number of non-leaf should be :
1465 : // 2^(log2(n) - 1) - 1
1466 : // = 2^log2(n) * 2^-1 - 1
1467 : // = n / 2 - 1.
1468 12005 : // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1469 1 : // number of comparisons in a simple closed form :
1470 1 : // n + n / 2 - 1 = n * 3 / 2 - 1
1471 : if (NumCaseCluster <= 3) {
1472 : // Suppose a comparison includes one compare and one conditional branch.
1473 12004 : Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1474 9577 : return false;
1475 : }
1476 :
1477 2427 : int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1478 : int64_t SwitchCost =
1479 2427 : ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1480 2427 :
1481 1 : Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1482 : return false;
1483 : }
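The closed form used above, n * 3 / 2 - 1, is only an estimate of the recursion f(n); the two diverge slightly for some cluster counts. Below is a minimal standalone sketch of the arithmetic, not part of InlineCost.cpp, which hard-codes 5 as an assumed stand-in for InlineConstants::InstrCost and prints the recursion, the estimate, and the resulting switch cost side by side:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// f(n) from the comment above: comparisons when the switch is lowered as a
// balanced binary tree of case clusters.
static int64_t TreeComparisons(int64_t N) {
  if (N <= 3)
    return N;
  return 1 + TreeComparisons(N / 2) + TreeComparisons(N - N / 2);
}

int main() {
  const int64_t InstrCost = 5; // assumed value of InlineConstants::InstrCost
  for (int64_t Clusters : {4, 8, 16, 64, 256}) {
    int64_t Estimate = 3 * Clusters / 2 - 1;       // closed form from the comment
    int64_t SwitchCost = Estimate * 2 * InstrCost; // one compare + one branch each
    std::printf("clusters=%lld  f(n)=%lld  estimate=%lld  cost=%lld\n",
                (long long)Clusters, (long long)TreeComparisons(Clusters),
                (long long)Estimate, (long long)SwitchCost);
  }
  return 0;
}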
1484 1 :
1485 1 : bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1486 1 : // We never want to inline functions that contain an indirectbr. This is
1487 : // incorrect because all the blockaddress's (in static global initializers
1488 : // for example) would be referring to the original function, and this
1489 : // indirect jump would jump from the inlined copy of the function into the
1490 2426 : // original function which is extremely undefined behavior.
1491 : // FIXME: This logic isn't really right; we can safely inline functions with
1492 : // indirectbr's as long as no other function or global references the
1493 : // blockaddress of a block within the current function.
1494 141 : HasIndirectBr = true;
1495 141 : return false;
1496 132 : }
1497 :
1498 : bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1499 : // FIXME: It's not clear that a single instruction is an accurate model for
1500 : // the inline cost of a resume instruction.
1501 2 : return false;
1502 1 : }
1503 1 :
1504 1 : bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1505 : // FIXME: It's not clear that a single instruction is an accurate model for
1506 : // the inline cost of a cleanupret instruction.
1507 1 : return false;
1508 : }
1509 :
1510 : bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1511 : // FIXME: It's not clear that a single instruction is an accurate model for
1512 129 : // the inline cost of a catchret instruction.
1513 129 : return false;
1514 : }
1515 :
1516 10 : bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1517 : // FIXME: It might be reasonable to discount the cost of instructions leading
1518 : // to unreachable as they have the lowest possible impact on both runtime and
1519 : // code size.
1520 4 : return true; // No actual code is needed for unreachable.
1521 4 : }
1522 3 :
1523 : bool CallAnalyzer::visitInstruction(Instruction &I) {
1524 : // Some instructions are free. All of the free intrinsics can also be
1525 3 : // handled by SROA, etc.
1526 3 : if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1527 2 : return true;
1528 :
1529 : // We found something we don't understand or can't handle. Mark any SROA-able
1530 : // values in the operand list as no longer viable.
1531 : for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1532 : disableSROA(*OI);
1533 10588 :
1534 : return false;
1535 : }
1536 10588 :
1537 : /// Analyze a basic block for its contribution to the inline cost.
1538 21176 : ///
1539 181 : /// This method walks the analyzer over every instruction in the given basic
1540 : /// block and accounts for their cost during inlining at this callsite. It
1541 : /// aborts early if the threshold has been exceeded or an impossible to inline
1542 : /// construct has been detected. It returns false if inlining is no longer
1543 : /// viable, and true if inlining remains viable.
1544 : InlineResult
1545 : CallAnalyzer::analyzeBlock(BasicBlock *BB,
1546 : SmallPtrSetImpl<const Value *> &EphValues) {
1547 : for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1548 : // FIXME: Currently, the number of instructions in a function, regardless of
1549 : // our ability to simplify them during inlining to constants or dead code,
1550 : // is actually used by the vector bonus heuristic. As long as that's true,
1551 : // we have to special case debug intrinsics here to prevent differences in
1552 : // inlining due to debug symbols. Eventually, the number of unsimplified
1553 : // instructions shouldn't factor into the cost computation, but until then,
1554 : // hack around it here.
1555 : if (isa<DbgInfoIntrinsic>(I))
1556 : continue;
1557 :
1558 : // Skip ephemeral values.
1559 : if (EphValues.count(&*I))
1560 : continue;
1561 :
1562 20814 : ++NumInstructions;
1563 10407 : if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1564 : ++NumVectorInstructions;
1565 10407 :
1566 2411 : // If the instruction simplified to a constant, there is no cost to this
1567 2411 : // instruction. Visit the instructions using our InstVisitor to account for
1568 : // all of the per-instruction logic. The visit tree returns true if we
1569 : // consumed the instruction in any way, and false if the instruction's base
1570 7996 : // cost should count against inlining.
1571 : if (Base::visit(&*I))
1572 7996 : ++NumInstructionsSimplified;
1573 : else
1574 : Cost += InlineConstants::InstrCost;
1575 :
1576 7996 : using namespace ore;
1577 4592 : // If the visit this instruction detected an uninlinable pattern, abort.
1578 : InlineResult IR;
1579 : if (IsRecursiveCall)
1580 4592 : IR = "recursive";
1581 4592 : else if (ExposesReturnsTwice)
1582 : IR = "exposes returns twice";
1583 : else if (HasDynamicAlloca)
1584 : IR = "dynamic alloca";
1585 : else if (HasIndirectBr)
1586 : IR = "indirect branch";
1587 : else if (HasUninlineableIntrinsic)
1588 : IR = "uninlinable intrinsic";
1589 : else if (InitsVargArgs)
1590 : IR = "varargs";
1591 : if (!IR) {
1592 : if (ORE)
1593 : ORE->emit([&]() {
1594 : return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1595 : CandidateCS.getInstruction())
1596 : << NV("Callee", &F) << " has uninlinable pattern ("
1597 : << NV("InlineResult", IR.message)
1598 : << ") and cost is not fully computed";
1599 3404 : });
1600 : return IR;
1601 3318 : }
1602 3318 :
1603 : // If the caller is a recursive function then we don't want to inline
1604 : // functions which allocate a lot of stack space because it would increase
1605 86 : // the caller stack usage dramatically.
1606 86 : if (IsCallerRecursive &&
1607 : AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1608 : InlineResult IR = "recursive and allocates too much stack space";
1609 86 : if (ORE)
1610 86 : ORE->emit([&]() {
1611 : return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1612 : CandidateCS.getInstruction())
1613 0 : << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1614 : << ". Cost is not fully computed";
1615 : });
1616 : return IR;
1617 : }
1618 :
1619 : // Check if we've passed the maximum possible threshold so we don't spin in
1620 : // huge basic blocks that will never inline.
1621 : if (Cost >= Threshold && !ComputeFullInlineCost)
1622 0 : return false;
1623 0 : }
1624 :
1625 : return true;
1626 0 : }
1627 :
1628 : /// Compute the base pointer and cumulative constant offsets for V.
1629 0 : ///
1630 : /// This strips all constant offsets off of V, leaving it the base pointer, and
1631 : /// accumulates the total constant offset applied in the returned constant. It
1632 0 : /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1633 : /// no constant offsets applied.
1634 : ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1635 0 : if (!V->getType()->isPointerTy())
1636 : return nullptr;
1637 :
1638 0 : unsigned AS = V->getType()->getPointerAddressSpace();
1639 : unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1640 : APInt Offset = APInt::getNullValue(IntPtrWidth);
1641 0 :
1642 : // Even though we don't look through PHI nodes, we could be called on an
1643 : // instruction in an unreachable block, which may be on a cycle.
1644 0 : SmallPtrSet<Value *, 4> Visited;
1645 : Visited.insert(V);
1646 : do {
1647 : if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1648 0 : if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1649 : return nullptr;
1650 : V = GEP->getPointerOperand();
1651 1286296 : } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1652 : V = cast<Operator>(V)->getOperand(0);
1653 : } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1654 1286296 : if (GA->isInterposable())
1655 : break;
1656 : V = GA->getAliasee();
1657 : } else {
1658 : break;
1659 3596096 : }
1660 2640784 : assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1661 : } while (Visited.insert(V).second);
1662 :
1663 : Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1664 : return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1665 : }
1666 :
1667 : /// Find dead blocks due to deleted CFG edges during inlining.
1668 : ///
1669 : /// If we know the successor of the current block, \p CurrBB, has to be \p
1670 : /// NextBB, the other successors of \p CurrBB are dead if these successors have
1671 : /// no live incoming CFG edges. If one block is found to be dead, we can
1672 : /// continue growing the dead block list by checking the successors of the dead
1673 1335000 : /// blocks to see if all their incoming edges are dead or not.
1674 : void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1675 20376938 : auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1676 : // A CFG edge is dead if the predecessor is dead or the predecessor has a
1677 : // known successor which is not the one under examination.
1678 : return (DeadBlocks.count(Pred) ||
1679 : (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1680 : };
1681 :
1682 : auto IsNewlyDead = [&](BasicBlock *BB) {
1683 19133463 : // If all the edges to a block are dead, the block is also dead.
1684 943578 : return (!DeadBlocks.count(BB) &&
1685 : llvm::all_of(predecessors(BB),
1686 : [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1687 18189888 : };
1688 :
1689 : for (BasicBlock *Succ : successors(CurrBB)) {
1690 18189885 : if (Succ == NextBB || !IsNewlyDead(Succ))
1691 18189885 : continue;
1692 24 : SmallVector<BasicBlock *, 4> NewDead;
1693 : NewDead.push_back(Succ);
1694 : while (!NewDead.empty()) {
1695 : BasicBlock *Dead = NewDead.pop_back_val();
1696 : if (DeadBlocks.insert(Dead))
1697 : // Continue growing the dead block lists.
1698 : for (BasicBlock *S : successors(Dead))
1699 36379770 : if (IsNewlyDead(S))
1700 3035908 : NewDead.push_back(S);
1701 : }
1702 15153977 : }
1703 : }
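A toy, self-contained rendering of the walk above, not part of InlineCost.cpp, with basic blocks reduced to integers. It assumes block 0's conditional branch folded so that only successor 1 is live; blocks 2 and 3, reachable only through the discarded edge, are then discovered as dead, mirroring what findDeadBlocks does with real BasicBlocks:

#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // Succ[b] lists successors of block b; Pred[b] lists its predecessors.
  // CFG: 0 -> {1, 2}, 1 -> {4}, 2 -> {3}, 3 -> {4}, 4 -> {}.
  std::vector<std::vector<int>> Succ = {{1, 2}, {4}, {3}, {4}, {}};
  std::vector<std::vector<int>> Pred = {{}, {0}, {0}, {2}, {1, 3}};

  std::map<int, int> KnownSuccessors; // block -> the one successor it folds to
  std::set<int> DeadBlocks;

  auto IsEdgeDead = [&](int P, int S) {
    auto It = KnownSuccessors.find(P);
    return DeadBlocks.count(P) != 0 ||
           (It != KnownSuccessors.end() && It->second != S);
  };
  auto IsNewlyDead = [&](int B) {
    if (DeadBlocks.count(B) || Pred[B].empty()) // the entry block is never dead
      return false;
    for (int P : Pred[B])
      if (!IsEdgeDead(P, B))
        return false;
    return true;
  };

  // Pretend block 0's branch simplified so only the edge to block 1 survives.
  int CurrBB = 0, NextBB = 1;
  KnownSuccessors[CurrBB] = NextBB;
  for (int S : Succ[CurrBB]) {
    if (S == NextBB || !IsNewlyDead(S))
      continue;
    std::vector<int> NewDead{S};
    while (!NewDead.empty()) {
      int Dead = NewDead.back();
      NewDead.pop_back();
      if (DeadBlocks.insert(Dead).second) // keep growing the dead block list
        for (int D : Succ[Dead])
          if (IsNewlyDead(D))
            NewDead.push_back(D);
    }
  }

  for (int B : DeadBlocks)
    std::printf("dead block: %d\n", B); // prints 2 and 3
  return 0;
}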
1704 :
1705 : /// Analyze a call site for potential inlining.
1706 : ///
1707 18189885 : /// Returns true if inlining this call is viable, and false if it is not
1708 1758 : /// viable. It computes the cost and adjusts the threshold based on numerous
1709 18188127 : /// factors and heuristics. If this method returns false but the computed cost
1710 4 : /// is below the computed threshold, then inlining was forcibly disabled by
1711 18188123 : /// some artifact of the routine.
1712 7 : InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
1713 18188116 : ++NumCallsAnalyzed;
1714 0 :
1715 18188116 : // Perform some tweaks to the cost and threshold based on the direct
1716 7 : // callsite information.
1717 18188109 :
1718 661 : // We want to more aggressively inline vector-dense kernels, so up the
1719 18189885 : // threshold, and we'll lower it if the % of vector instructions gets too
1720 2437 : // low. Note that these bonuses are somewhat arbitrary and evolved over time
1721 39 : // by accident as much as because they are principled bonuses.
1722 : //
1723 : // FIXME: It would be nice to remove all such bonuses. At least it would be
1724 : // nice to base the bonus values on something more scientific.
1725 : assert(NumInstructions == 0);
1726 : assert(NumVectorInstructions == 0);
1727 :
1728 91525 : // Update the threshold based on callsite properties
1729 : updateThreshold(CS, F);
1730 :
1731 : // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1732 : // this Threshold any time, and cost cannot decrease, we can stop processing
1733 : // the rest of the function body.
1734 18187448 : Threshold += (SingleBBBonus + VectorBonus);
1735 894706 :
1736 : // Give out bonuses for the callsite, as the instructions setting them up
1737 6516 : // will be gone after inlining.
1738 3 : Cost -= getCallsiteCost(CS, DL);
1739 :
1740 : // If this function uses the coldcc calling convention, prefer not to inline
1741 : // it.
1742 : if (F.getCallingConv() == CallingConv::Cold)
1743 : Cost += InlineConstants::ColdccPenalty;
1744 6516 :
1745 : // Check if we're done. This can happen due to bonuses and penalties.
1746 : if (Cost >= Threshold && !ComputeFullInlineCost)
1747 : return "high cost";
1748 :
1749 18180932 : if (F.empty())
1750 82572 : return true;
1751 :
1752 : Function *Caller = CS.getInstruction()->getFunction();
1753 1243475 : // Check if the caller function is recursive itself.
1754 : for (User *U : Caller->users()) {
1755 : CallSite Site(U);
1756 : if (!Site)
1757 : continue;
1758 : Instruction *I = Site.getInstruction();
1759 : if (I->getFunction() == Caller) {
1760 : IsCallerRecursive = true;
1761 : break;
1762 527549 : }
1763 1055098 : }
1764 :
1765 : // Populate our simplified values by mapping from function arguments to call
1766 : // arguments with known important simplifications.
1767 472189 : CallSite::arg_iterator CAI = CS.arg_begin();
1768 472189 : for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1769 : FAI != FAE; ++FAI, ++CAI) {
1770 : assert(CAI != CS.arg_end());
1771 : if (Constant *C = dyn_cast<Constant>(CAI))
1772 : SimplifiedValues[&*FAI] = C;
1773 472189 :
1774 : Value *PtrArg = *CAI;
1775 611435 : if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1776 102429 : ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1777 5200 :
1778 97229 : // We can SROA any pointer arguments derived from alloca instructions.
1779 321299 : if (isa<AllocaInst>(PtrArg)) {
1780 84034 : SROAArgValues[&*FAI] = PtrArg;
1781 : SROAArgCosts[PtrArg] = 0;
1782 : }
1783 : }
1784 0 : }
1785 : NumConstantArgs = SimplifiedValues.size();
1786 : NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1787 : NumAllocaArgs = SROAArgValues.size();
1788 :
1789 139246 : // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1790 : // the ephemeral values multiple times (and they're completely determined by
1791 466989 : // the callee, so this is purely duplicate work).
1792 466989 : SmallPtrSet<const Value *, 32> EphValues;
1793 : CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1794 :
1795 : // The worklist of live basic blocks in the callee *after* inlining. We avoid
1796 : // adding basic blocks of the callee which can be proven to be dead for this
1797 : // particular call site in order to get more accurate cost estimates. This
1798 : // requires a somewhat heavyweight iteration pattern: we need to walk the
1799 : // basic blocks in a breadth-first order as we insert live successors. To
1800 : // accomplish this, prioritizing for small iterations because we exit after
1801 : // crossing our threshold, we use a small-size optimized SetVector.
1802 9376 : typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1803 : SmallPtrSet<BasicBlock *, 16>>
1804 : BBSetVector;
1805 : BBSetVector BBWorklist;
1806 : BBWorklist.insert(&F.getEntryBlock());
1807 : bool SingleBB = true;
1808 9376 : // Note that we *must not* cache the size, this loop grows the worklist.
1809 : for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1810 : // Bail out the moment we cross the threshold. This means we'll under-count
1811 : // the cost, but only when undercounting doesn't matter.
1812 : if (Cost >= Threshold && !ComputeFullInlineCost)
1813 : break;
1814 32064 :
1815 9376 : BasicBlock *BB = BBWorklist[Idx];
1816 : if (BB->empty())
1817 37778 : continue;
1818 19026 :
1819 11290 : // Disallow inlining a blockaddress. A blockaddress only has defined
1820 : // behavior for an indirect branch in the same function, and we do not
1821 7736 : // currently support inlining indirect branches. But, the inliner may not
1822 22947 : // see an indirect branch that ends up being dead code at a particular call
1823 15211 : // site. If the blockaddress escapes the function, e.g., via a global
1824 15211 : // variable, inlining may lead to an invalid cross-function reference.
1825 : if (BB->hasAddressTaken())
1826 46248 : return "blockaddress";
1827 15826 :
1828 7475 : // Analyze the cost of this block. If we blow through the threshold, this
1829 : // returns false, and we can bail on out.
1830 : InlineResult IR = analyzeBlock(BB, EphValues);
1831 9376 : if (!IR)
1832 : return IR;
1833 :
1834 : Instruction *TI = BB->getTerminator();
1835 :
1836 : // Add in the live successors by first checking whether we have terminator
1837 : // that may be simplified based on the values simplified by this call.
1838 : if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1839 : if (BI->isConditional()) {
1840 306857 : Value *Cond = BI->getCondition();
1841 : if (ConstantInt *SimpleCond =
1842 : dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1843 : BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1844 : BBWorklist.insert(NextBB);
1845 : KnownSuccessors[BB] = NextBB;
1846 : findDeadBlocks(BB, NextBB);
1847 : continue;
1848 : }
1849 : }
1850 : } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1851 : Value *Cond = SI->getCondition();
1852 : if (ConstantInt *SimpleCond =
1853 : dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1854 : BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1855 : BBWorklist.insert(NextBB);
1856 : KnownSuccessors[BB] = NextBB;
1857 306857 : findDeadBlocks(BB, NextBB);
1858 : continue;
1859 : }
1860 : }
1861 :
1862 306857 : // If we're unable to select a particular successor, just count all of
1863 : // them.
1864 : for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1865 : ++TIdx)
1866 306857 : BBWorklist.insert(TI->getSuccessor(TIdx));
1867 :
1868 : // If we had any successors at this point, then post-inlining is likely to
1869 : // have them as well. Note that we assume any basic blocks which existed
1870 613714 : // due to branches or switches which folded above will also fold after
1871 11 : // inlining.
1872 : if (SingleBB && TI->getNumSuccessors() > 1) {
1873 : // Take off the bonus we applied to the threshold.
1874 306857 : Threshold -= SingleBBBonus;
1875 3 : SingleBB = false;
1876 : }
1877 306854 : }
1878 18 :
1879 : bool OnlyOneCallAndLocalLinkage =
1880 : F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1881 : // If this is a noduplicate call, we can still inline as long as
1882 990477 : // inlining this would cause the removal of the caller (so the instruction
1883 : // is not actually duplicated, just moved).
1884 695280 : if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1885 : return "noduplicate";
1886 :
1887 624240 : // We applied the maximum possible vector bonus at the beginning. Now,
1888 11639 : // subtract the excess bonus, if any, from the Threshold before
1889 : // comparing against Cost.
1890 : if (NumVectorInstructions <= NumInstructions / 10)
1891 : Threshold -= VectorBonus;
1892 : else if (NumVectorInstructions <= NumInstructions / 2)
1893 : Threshold -= VectorBonus/2;
1894 :
1895 : return Cost < std::max(1, Threshold);
1896 834385 : }
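The bonus bookkeeping in analyzeCall reduces to plain integer arithmetic: both bonuses are granted up front, the single-basic-block bonus is revoked as soon as a second live block shows up, and the vector bonus is revoked (fully or by half) once the vector-instruction density is known. A minimal standalone sketch follows, not part of InlineCost.cpp; the 50% and 150% bonus sizes mirror common defaults but are assumptions here, while 225 is the -inline-threshold default defined earlier in this file:

#include <algorithm>
#include <cstdio>

int main() {
  int Threshold = 225;                       // base threshold (default flag value)
  const int SingleBBBonus = Threshold / 2;   // assumed 50% bonus
  const int VectorBonus = Threshold * 3 / 2; // assumed 150% bonus

  // Speculatively apply every possible bonus before walking the callee.
  Threshold += SingleBBBonus + VectorBonus;

  // Pretend the callee turned out to have several blocks and almost no
  // vector instructions, so both bonuses are taken back.
  bool SingleBB = false;
  int NumInstructions = 40, NumVectorInstructions = 1;
  if (!SingleBB)
    Threshold -= SingleBBBonus;
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;

  int Cost = 180; // whatever the per-instruction walk accumulated
  std::printf("inline? %s (cost=%d, threshold=%d)\n",
              Cost < std::max(1, Threshold) ? "yes" : "no", Cost, Threshold);
  return 0;
}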
1897 834385 :
1898 : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1899 : /// Dump stats about this call's analysis.
1900 71067 : LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1901 : #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1902 527549 : DEBUG_PRINT_STAT(NumConstantArgs);
1903 527549 : DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1904 933978 : DEBUG_PRINT_STAT(NumAllocaArgs);
1905 : DEBUG_PRINT_STAT(NumConstantPtrCmps);
1906 : DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1907 466989 : DEBUG_PRINT_STAT(NumInstructionsSimplified);
1908 212262 : DEBUG_PRINT_STAT(NumInstructions);
1909 212262 : DEBUG_PRINT_STAT(SROACostSavings);
1910 : DEBUG_PRINT_STAT(SROACostSavingsLost);
1911 : DEBUG_PRINT_STAT(LoadEliminationCost);
1912 : DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1913 306836 : DEBUG_PRINT_STAT(Cost);
1914 306836 : DEBUG_PRINT_STAT(Threshold);
1915 306836 : #undef DEBUG_PRINT_STAT
1916 : }
1917 : #endif
1918 :
1919 : /// Test that there are no attribute conflicts between Caller and Callee
1920 : /// that prevent inlining.
1921 613672 : static bool functionsHaveCompatibleAttributes(Function *Caller,
1922 : Function *Callee,
1923 : TargetTransformInfo &TTI) {
1924 : return TTI.areInlineCompatible(Caller, Callee) &&
1925 : AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1926 : }
1927 :
1928 : int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1929 : int Cost = 0;
1930 : for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1931 : if (CS.isByValArgument(I)) {
1932 : // We approximate the number of loads and stores needed by dividing the
1933 306836 : // size of the byval type by the target's pointer size.
1934 613672 : PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1935 : unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1936 : unsigned AS = PTy->getAddressSpace();
1937 1857147 : unsigned PointerSize = DL.getPointerSizeInBits(AS);
1938 : // Ceiling division.
1939 : unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1940 1339462 :
1941 : // If it generates more than 8 stores it is likely to be expanded as an
1942 : // inline memcpy so we take that as an upper bound. Otherwise we assume
1943 1335002 : // one load and one store per word copied.
1944 1335002 : // FIXME: The maxStoresPerMemcpy setting from the target should be used
1945 9376 : // here instead of a magic number of 8, but it's not available via
1946 : // DataLayout.
1947 : NumStores = std::min(NumStores, 8U);
1948 :
1949 : Cost += 2 * NumStores * InlineConstants::InstrCost;
1950 : } else {
1951 : // For non-byval arguments subtract off one instruction per call
1952 : // argument.
1953 1335002 : Cost += InlineConstants::InstrCost;
1954 2 : }
1955 : }
1956 : // The call instruction also disappears after inlining.
1957 : Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1958 1335000 : return Cost;
1959 1335000 : }
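The byval accounting above is simple arithmetic: a ceiling division of the byval type size by the pointer size, capped at eight stores, with a load and a store charged per word copied. A minimal standalone sketch, not part of InlineCost.cpp; the values 5 for InstrCost and 25 for CallPenalty mirror the usual InlineConstants but are hard-coded assumptions here:

#include <algorithm>
#include <cstdio>

int main() {
  const unsigned InstrCost = 5, CallPenalty = 25; // assumed constants
  const unsigned PointerSize = 64;                // a 64-bit target

  // A call passing one 1024-bit struct byval plus two ordinary arguments.
  unsigned ByValBits = 1024;
  unsigned NumStores = (ByValBits + PointerSize - 1) / PointerSize; // ceil -> 16
  NumStores = std::min(NumStores, 8u); // beyond 8, assume an inline memcpy

  unsigned Cost = 2 * NumStores * InstrCost  // loads + stores for the byval copy
                  + 2 * InstrCost            // the two ordinary arguments
                  + InstrCost + CallPenalty; // the call instruction itself
  std::printf("estimated callsite cost = %u\n", Cost); // 80 + 10 + 30 = 120
  return 0;
}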
1960 91525 :
1961 : InlineCost llvm::getInlineCost(
1962 1243475 : CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1963 : std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1964 : Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1965 : ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1966 : return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1967 714323 : GetAssumptionCache, GetBFI, PSI, ORE);
1968 : }
1969 :
1970 728792 : InlineCost llvm::getInlineCost(
1971 9195 : CallSite CS, Function *Callee, const InlineParams &Params,
1972 9195 : TargetTransformInfo &CalleeTTI,
1973 9195 : std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1974 9195 : Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1975 : ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1976 :
1977 : // Cannot inline indirect calls.
1978 : if (!Callee)
1979 : return llvm::InlineCost::getNever("indirect call");
1980 :
1981 16068 : // Never inline calls with byval arguments that do not have the alloca
1982 181 : // address space. Since byval arguments can be replaced with a copy to an
1983 181 : // alloca, the inlined code would need to be adjusted to handle that the
1984 181 : // argument is in the alloca address space (so it is a little bit complicated
1985 181 : // to solve).
1986 : unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
1987 : for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
1988 : if (CS.isByValArgument(I)) {
1989 : PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1990 : if (PTy->getAddressSpace() != AllocaAS)
1991 : return llvm::InlineCost::getNever("byval arguments without alloca"
1992 2812570 : " address space");
1993 : }
1994 1578471 :
1995 : // Calls to functions with always-inline attributes should be inlined
1996 : // whenever possible.
1997 : if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1998 : if (isInlineViable(*Callee))
1999 : return llvm::InlineCost::getAlways("always inline attribute");
2000 1234099 : return llvm::InlineCost::getNever("inapplicable always inline attribute");
2001 : }
2002 175770 :
2003 : // Never inline functions with conflicting attributes (unless callee has
2004 : // always-inline attribute).
2005 : Function *Caller = CS.getCaller();
2006 : if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
2007 : return llvm::InlineCost::getNever("conflicting attributes");
2008 237045 :
2009 : // Don't inline this call if the caller has the optnone attribute.
2010 : if (Caller->hasFnAttribute(Attribute::OptimizeNone))
2011 : return llvm::InlineCost::getNever("optnone attribute");
2012 208818 :
2013 9 : // Don't inline a function that treats null pointer as valid into a caller
2014 : // that does not have this attribute.
2015 : if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2016 : return llvm::InlineCost::getNever("nullptr definitions incompatible");
2017 :
2018 215300 : // Don't inline functions which can be interposed at link-time.
2019 215289 : if (Callee->isInterposable())
2020 11 : return llvm::InlineCost::getNever("interposable");
2021 7 :
2022 : // Don't inline functions marked noinline.
2023 498799 : if (Callee->hasFnAttribute(Attribute::NoInline))
2024 : return llvm::InlineCost::getNever("noinline function attribute");
2025 :
2026 : // Don't inline call sites marked noinline.
2027 : if (CS.isNoInline())
2028 : return llvm::InlineCost::getNever("noinline call site attribute");
2029 :
2030 : LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2031 : << "... (caller:" << Caller->getName() << ")\n");
2032 :
2033 : CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2034 : Params);
2035 : InlineResult ShouldInline = CA.analyzeCall(CS);
2036 :
2037 : LLVM_DEBUG(CA.dump());
2038 :
2039 : // Check if there was a reason to force inlining or no inlining.
2040 : if (!ShouldInline && CA.getCost() < CA.getThreshold())
2041 : return InlineCost::getNever(ShouldInline.message);
2042 : if (ShouldInline && CA.getCost() >= CA.getThreshold())
2043 : return InlineCost::getAlways("empty function");
2044 :
2045 : return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2046 : }
2047 :
2048 : bool llvm::isInlineViable(Function &F) {
2049 334810 : bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2050 : for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2051 : // Disallow inlining of functions which contain indirect branches or
2052 669602 : // blockaddresses.
2053 334792 : if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2054 : return false;
2055 :
2056 307739 : for (auto &II : *BI) {
2057 : CallSite CS(&II);
2058 836650 : if (!CS)
2059 528911 : continue;
2060 :
2061 : // Disallow recursive calls.
2062 5426 : if (&F == CS.getCalledFunction())
2063 5426 : return false;
2064 :
2065 : // Disallow calls which expose returns-twice to a function not previously
2066 : // attributed as such.
2067 5426 : if (!ReturnsTwice && CS.isCall() &&
2068 : cast<CallInst>(CS.getInstruction())->canReturnTwice())
2069 : return false;
2070 :
2071 : if (CS.getCalledFunction())
2072 : switch (CS.getCalledFunction()->getIntrinsicID()) {
2073 : default:
2074 : break;
2075 5426 : // Disallow inlining of @llvm.icall.branch.funnel because current
2076 : // backend can't separate call targets from call arguments.
2077 5426 : case llvm::Intrinsic::icall_branch_funnel:
2078 : // Disallow inlining functions that call @llvm.localescape. Doing this
2079 : // correctly would require major changes to the inliner.
2080 : case llvm::Intrinsic::localescape:
2081 523485 : // Disallow inlining of functions that initialize VarArgs with va_start.
2082 : case llvm::Intrinsic::vastart:
2083 : return false;
2084 : }
2085 307739 : }
2086 307739 : }
2087 :
2088 : return true;
2089 348535 : }
2090 :
2091 : // APIs to create InlineParams based on command line flags and/or other
2092 : // parameters.
2093 :
2094 : InlineParams llvm::getInlineParams(int Threshold) {
2095 348535 : InlineParams Params;
2096 :
2097 : // This field is the threshold to use for a callee by default. This is
2098 348546 : // derived from one or more of:
2099 : // * optimization or size-optimization levels,
2100 : // * a value passed to createFunctionInliningPass function, or
2101 : // * the -inline-threshold flag.
2102 : // If the -inline-threshold flag is explicitly specified, that is used
2103 : // irrespective of anything else.
2104 : if (InlineThreshold.getNumOccurrences() > 0)
2105 : Params.DefaultThreshold = InlineThreshold;
2106 348546 : else
2107 : Params.DefaultThreshold = Threshold;
2108 :
2109 : // Set the HintThreshold knob from the -inlinehint-threshold.
2110 : Params.HintThreshold = HintThreshold;
2111 :
2112 : // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2113 : Params.HotCallSiteThreshold = HotCallSiteThreshold;
2114 348546 :
2115 928864 : // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2116 580321 : // populate LocallyHotCallSiteThreshold. Later, we populate
2117 5454 : // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2118 5454 : // we know that optimization level is O3 (in the getInlineParams variant that
2119 : // takes the opt and size levels).
2120 : // FIXME: Remove this check (and make the assignment unconditional) after
2121 : // addressing size regression issues at O2.
2122 : if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2123 : Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2124 :
2125 348543 : // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2126 13733 : Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2127 :
2128 : // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2129 : // -inlinehint-threshold commandline option is not explicitly given. If that
2130 : // option is present, then its value applies even for callees with size and
2131 : // minsize attributes.
2132 : // If the -inline-threshold is not specified, set the ColdThreshold from the
2133 : // -inlinecold-threshold even if it is not explicitly passed. If
2134 334810 : // -inline-threshold is specified, then -inlinecold-threshold needs to be
2135 : // explicitly specified to set the ColdThreshold knob
2136 : if (InlineThreshold.getNumOccurrences() == 0) {
2137 : Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2138 334765 : Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2139 : Params.ColdThreshold = ColdThreshold;
2140 : } else if (ColdThreshold.getNumOccurrences() > 0) {
2141 : Params.ColdThreshold = ColdThreshold;
2142 : }
2143 334762 : return Params;
2144 : }
2145 :
2146 : InlineParams llvm::getInlineParams() {
2147 : return getInlineParams(InlineThreshold);
2148 : }
2149 :
2150 : // Compute the default threshold for inlining based on the opt level and the
2151 334706 : // size opt level.
2152 : static int computeThresholdFromOptLevels(unsigned OptLevel,
2153 : unsigned SizeOptLevel) {
2154 : if (OptLevel > 2)
2155 306433 : return InlineConstants::OptAggressiveThreshold;
2156 : if (SizeOptLevel == 1) // -Os
2157 : return InlineConstants::OptSizeThreshold;
2158 : if (SizeOptLevel == 2) // -Oz
2159 : return InlineConstants::OptMinSizeThreshold;
2160 : return InlineThreshold;
2161 : }
2162 612822 :
2163 306411 : InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2164 : auto Params =
2165 : getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2166 : // At O3, use the value of -locally-hot-callsite-threshold option to populate
2167 : // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2168 306411 : // when it is specified explicitly.
2169 : if (OptLevel > 2)
2170 297457 : Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2171 : return Params;
2172 : }
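Putting the two getInlineParams overloads together, the default threshold a call site starts from is chosen purely by the optimization and size-optimization levels before any per-callsite adjustment. A minimal standalone sketch of that mapping, not part of InlineCost.cpp; the numbers 275/50/5 mirror the usual InlineConstants::OptAggressiveThreshold, OptSizeThreshold, and OptMinSizeThreshold but are assumptions here, while 225 is the -inline-threshold default defined earlier in this file:

#include <cstdio>

static int DefaultThresholdFor(unsigned OptLevel, unsigned SizeOptLevel) {
  if (OptLevel > 2)      return 275; // assumed OptAggressiveThreshold (-O3)
  if (SizeOptLevel == 1) return 50;  // assumed OptSizeThreshold (-Os)
  if (SizeOptLevel == 2) return 5;   // assumed OptMinSizeThreshold (-Oz)
  return 225;                        // -inline-threshold default
}

int main() {
  std::printf("-O2: %d  -O3: %d  -Os: %d  -Oz: %d\n",
              DefaultThresholdFor(2, 0), DefaultThresholdFor(3, 0),
              DefaultThresholdFor(2, 1), DefaultThresholdFor(2, 2));
  return 0;
}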