Line data Source code
1 : //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file implements inline cost analysis.
11 : //
12 : //===----------------------------------------------------------------------===//
13 :
14 : #include "llvm/Analysis/InlineCost.h"
15 : #include "llvm/ADT/STLExtras.h"
16 : #include "llvm/ADT/SetVector.h"
17 : #include "llvm/ADT/SmallPtrSet.h"
18 : #include "llvm/ADT/SmallVector.h"
19 : #include "llvm/ADT/Statistic.h"
20 : #include "llvm/Analysis/AssumptionCache.h"
21 : #include "llvm/Analysis/CodeMetrics.h"
22 : #include "llvm/Analysis/ConstantFolding.h"
23 : #include "llvm/Analysis/InstructionSimplify.h"
24 : #include "llvm/Analysis/TargetTransformInfo.h"
25 : #include "llvm/IR/CallSite.h"
26 : #include "llvm/IR/CallingConv.h"
27 : #include "llvm/IR/DataLayout.h"
28 : #include "llvm/IR/GetElementPtrTypeIterator.h"
29 : #include "llvm/IR/GlobalAlias.h"
30 : #include "llvm/IR/InstVisitor.h"
31 : #include "llvm/IR/IntrinsicInst.h"
32 : #include "llvm/IR/Operator.h"
33 : #include "llvm/Support/Debug.h"
34 : #include "llvm/Support/raw_ostream.h"
35 :
36 : using namespace llvm;
37 :
38 : #define DEBUG_TYPE "inline-cost"
39 :
40 : STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
41 :
42 : namespace {
43 :
44 8090 : class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
45 : typedef InstVisitor<CallAnalyzer, bool> Base;
46 : friend class InstVisitor<CallAnalyzer, bool>;
47 :
48 : /// The TargetTransformInfo available for this compilation.
49 : const TargetTransformInfo &TTI;
50 :
51 : /// The cache of @llvm.assume intrinsics.
52 : AssumptionCacheTracker *ACT;
53 :
54 : // The called function.
55 : Function &F;
56 :
57 : // The candidate callsite being analyzed. Please do not use this to do
58 : // analysis in the caller function; we want the inline cost query to be
59 : // easily cacheable. Instead, use the cover function paramHasAttr.
60 : CallSite CandidateCS;
61 :
62 : int Threshold;
63 : int Cost;
64 :
65 : bool IsCallerRecursive;
66 : bool IsRecursiveCall;
67 : bool ExposesReturnsTwice;
68 : bool HasDynamicAlloca;
69 : bool ContainsNoDuplicateCall;
70 : bool HasReturn;
71 : bool HasIndirectBr;
72 : bool HasFrameEscape;
73 :
74 : /// Number of bytes allocated statically by the callee.
75 : uint64_t AllocatedSize;
76 : unsigned NumInstructions, NumVectorInstructions;
77 : int FiftyPercentVectorBonus, TenPercentVectorBonus;
78 : int VectorBonus;
79 :
80 : // While we walk the potentially-inlined instructions, we build up and
81 : // maintain a mapping of simplified values specific to this callsite. The
82 : // idea is to propagate any special information we have about arguments to
83 : // this call through the inlinable section of the function, and account for
84 : // likely simplifications post-inlining. The most important aspect we track
85 : // is CFG altering simplifications -- when we prove a basic block dead, that
86 : // can cause dramatic shifts in the cost of inlining a function.
87 : DenseMap<Value *, Constant *> SimplifiedValues;
88 :
89 : // Keep track of the values which map back (through function arguments) to
90 : // allocas on the caller stack which could be simplified through SROA.
91 : DenseMap<Value *, Value *> SROAArgValues;
92 :
93 : // The mapping of caller Alloca values to their accumulated cost savings. If
94 : // we have to disable SROA for one of the allocas, this tells us how much
95 : // cost must be added.
96 : DenseMap<Value *, int> SROAArgCosts;
97 :
98 : // Keep track of values which map to a pointer base and constant offset.
99 : DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;
100 :
101 : // Custom simplification helper routines.
102 : bool isAllocaDerivedArg(Value *V);
103 : bool lookupSROAArgAndCost(Value *V, Value *&Arg,
104 : DenseMap<Value *, int>::iterator &CostIt);
105 : void disableSROA(DenseMap<Value *, int>::iterator CostIt);
106 : void disableSROA(Value *V);
107 : void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
108 : int InstructionCost);
109 : bool isGEPOffsetConstant(GetElementPtrInst &GEP);
110 : bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
111 : bool simplifyCallSite(Function *F, CallSite CS);
112 : ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
113 :
114 : /// Return true if the given argument to the function being considered for
115 : /// inlining has the given attribute set either at the call site or the
116 : /// function declaration. Primarily used to inspect call site specific
117 : /// attributes since these can be more precise than the ones on the callee
118 : /// itself.
119 : bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
120 :
121 : /// Return true if the given value is known non null within the callee if
122 : /// inlined through this particular callsite.
123 : bool isKnownNonNullInCallee(Value *V);
124 :
125 : // Custom analysis routines.
126 : bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
127 :
128 : // Disable several entry points to the visitor so we don't accidentally use
129 : // them by declaring but not defining them here.
130 : void visit(Module *); void visit(Module &);
131 : void visit(Function *); void visit(Function &);
132 : void visit(BasicBlock *); void visit(BasicBlock &);
133 :
134 : // Provide base case for our instruction visit.
135 : bool visitInstruction(Instruction &I);
136 :
137 : // Our visit overrides.
138 : bool visitAlloca(AllocaInst &I);
139 : bool visitPHI(PHINode &I);
140 : bool visitGetElementPtr(GetElementPtrInst &I);
141 : bool visitBitCast(BitCastInst &I);
142 : bool visitPtrToInt(PtrToIntInst &I);
143 : bool visitIntToPtr(IntToPtrInst &I);
144 : bool visitCastInst(CastInst &I);
145 : bool visitUnaryInstruction(UnaryInstruction &I);
146 : bool visitCmpInst(CmpInst &I);
147 : bool visitSub(BinaryOperator &I);
148 : bool visitBinaryOperator(BinaryOperator &I);
149 : bool visitLoad(LoadInst &I);
150 : bool visitStore(StoreInst &I);
151 : bool visitExtractValue(ExtractValueInst &I);
152 : bool visitInsertValue(InsertValueInst &I);
153 : bool visitCallSite(CallSite CS);
154 : bool visitReturnInst(ReturnInst &RI);
155 : bool visitBranchInst(BranchInst &BI);
156 : bool visitSwitchInst(SwitchInst &SI);
157 : bool visitIndirectBrInst(IndirectBrInst &IBI);
158 : bool visitResumeInst(ResumeInst &RI);
159 : bool visitCleanupReturnInst(CleanupReturnInst &RI);
160 : bool visitCatchReturnInst(CatchReturnInst &RI);
161 : bool visitUnreachableInst(UnreachableInst &I);
162 :
163 : public:
164 1618 : CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
165 : Function &Callee, int Threshold, CallSite CSArg)
166 : : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
167 : Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
168 : ExposesReturnsTwice(false), HasDynamicAlloca(false),
169 : ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
170 : HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
171 : NumVectorInstructions(0), FiftyPercentVectorBonus(0),
172 : TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
173 : NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
174 : NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
175 8090 : SROACostSavings(0), SROACostSavingsLost(0) {}
176 :
177 : bool analyzeCall(CallSite CS);
178 :
179 : int getThreshold() { return Threshold; }
180 : int getCost() { return Cost; }
181 :
182 : // Keep a bunch of stats about the cost savings found so we can print them
183 : // out when debugging.
184 : unsigned NumConstantArgs;
185 : unsigned NumConstantOffsetPtrArgs;
186 : unsigned NumAllocaArgs;
187 : unsigned NumConstantPtrCmps;
188 : unsigned NumConstantPtrDiffs;
189 : unsigned NumInstructionsSimplified;
190 : unsigned SROACostSavings;
191 : unsigned SROACostSavingsLost;
192 :
193 : void dump();
194 : };
195 :
196 : } // namespace
197 :
198 : /// \brief Test whether the given value is an Alloca-derived function argument.
199 : bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
200 122 : return SROAArgValues.count(V);
201 : }
202 :
203 : /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
204 : /// Returns false if V does not map to a SROA-candidate.
205 51053 : bool CallAnalyzer::lookupSROAArgAndCost(
206 : Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
207 105792 : if (SROAArgValues.empty() || SROAArgCosts.empty())
208 : return false;
209 :
210 3487 : DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
211 10461 : if (ArgIt == SROAArgValues.end())
212 : return false;
213 :
214 1025 : Arg = ArgIt->second;
215 1025 : CostIt = SROAArgCosts.find(Arg);
216 3075 : return CostIt != SROAArgCosts.end();
217 : }
218 :
/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  // Erasing the entry makes lookupSROAArgAndCost fail for this alloca from
  // here on, so no further savings are accumulated (or re-removed) for it.
  SROAArgCosts.erase(CostIt);
}
231 :
232 : /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
233 31195 : void CallAnalyzer::disableSROA(Value *V) {
234 : Value *SROAArg;
235 : DenseMap<Value *, int>::iterator CostIt;
236 31195 : if (lookupSROAArgAndCost(V, SROAArg, CostIt))
237 149 : disableSROA(CostIt);
238 31195 : }
239 :
240 : /// \brief Accumulate the given cost for a particular SROA candidate.
241 : void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
242 : int InstructionCost) {
243 528 : CostIt->second += InstructionCost;
244 528 : SROACostSavings += InstructionCost;
245 : }
246 :
247 : /// \brief Check whether a GEP's indices are all constant.
248 : ///
249 : /// Respects any simplified values known during the analysis of this callsite.
250 3411 : bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
251 9869 : for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
252 13287 : if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
253 : return false;
254 :
255 : return true;
256 : }
257 :
/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    // Every index must be (or simplify at this call site to) a ConstantInt.
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    // Zero indices contribute nothing to the offset.
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    // Sequential (array/pointer) index: scale the index by the allocation
    // size of the indexed element type, sign-extending to pointer width.
    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
290 :
291 1325 : bool CallAnalyzer::visitAlloca(AllocaInst &I) {
292 : // Check whether inlining will turn a dynamic alloca into a static
293 : // alloca, and handle that case.
294 1325 : if (I.isArrayAllocation()) {
295 18 : if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
296 5 : ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
297 : assert(AllocSize && "Allocation size not a constant int?");
298 5 : Type *Ty = I.getAllocatedType();
299 10 : AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
300 5 : return Base::visitAlloca(I);
301 : }
302 : }
303 :
304 : // Accumulate the allocated size.
305 1320 : if (I.isStaticAlloca()) {
306 1318 : const DataLayout &DL = F.getParent()->getDataLayout();
307 1318 : Type *Ty = I.getAllocatedType();
308 1318 : AllocatedSize += DL.getTypeAllocSize(Ty);
309 : }
310 :
311 : // We will happily inline static alloca instructions.
312 1320 : if (I.isStaticAlloca())
313 1318 : return Base::visitAlloca(I);
314 :
315 : // FIXME: This is overly conservative. Dynamic allocas are inefficient for
316 : // a variety of reasons, and so we would like to not inline them into
317 : // functions which don't currently have a dynamic alloca. This simply
318 : // disables inlining altogether in the presence of a dynamic alloca.
319 2 : HasDynamicAlloca = true;
320 2 : return false;
321 : }
322 :
/// Model a PHI node; currently a no-op.
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate it's bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}
335 :
/// Model a GEP: constant-indexed GEPs are free and propagate both SROA
/// candidacy and base+offset tracking; variable GEPs cost an instruction and
/// kill SROA for the underlying alloca.
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  // No base+offset info; a GEP is still free if all indices are constant.
  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}
383 :
384 2214 : bool CallAnalyzer::visitBitCast(BitCastInst &I) {
385 : // Propagate constants through bitcasts.
386 6642 : Constant *COp = dyn_cast<Constant>(I.getOperand(0));
387 2214 : if (!COp)
388 6636 : COp = SimplifiedValues.lookup(I.getOperand(0));
389 2214 : if (COp)
390 63 : if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
391 126 : SimplifiedValues[&I] = C;
392 63 : return true;
393 : }
394 :
395 : // Track base/offsets through casts
396 : std::pair<Value *, APInt> BaseAndOffset
397 4302 : = ConstantOffsetPtrs.lookup(I.getOperand(0));
398 : // Casts don't change the offset, just wrap it up.
399 2151 : if (BaseAndOffset.first)
400 369 : ConstantOffsetPtrs[&I] = BaseAndOffset;
401 :
402 : // Also look for SROA candidates here.
403 : Value *SROAArg;
404 : DenseMap<Value *, int>::iterator CostIt;
405 4302 : if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
406 98 : SROAArgValues[&I] = SROAArg;
407 :
408 : // Bitcasts are always zero cost.
409 2151 : return true;
410 : }
411 :
/// Model a ptrtoint cast; free when the target says so, and transparent to
/// base+offset tracking and SROA candidacy.
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Free only if the target models this cast as free.
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
448 :
/// Model an inttoptr cast; free when the target says so, and transparent to
/// base+offset tracking and SROA candidacy.
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  // (Previous comment said "ptrtoint" -- a copy-paste slip.)
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Free only if the target models this cast as free.
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
479 :
/// Fallback for cast instructions without a dedicated visitor (trunc, ext,
/// fptoui, etc.): constant-fold when possible, otherwise kill SROA on the
/// operand and charge whatever the target says the cast costs.
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  // (Previous comment said "ptrtoint" -- a copy-paste slip.)
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
496 :
497 1323 : bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
498 1323 : Value *Operand = I.getOperand(0);
499 2646 : Constant *COp = dyn_cast<Constant>(Operand);
500 1323 : if (!COp)
501 10 : COp = SimplifiedValues.lookup(Operand);
502 1323 : if (COp) {
503 1323 : const DataLayout &DL = F.getParent()->getDataLayout();
504 2646 : if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
505 1323 : COp, DL)) {
506 0 : SimplifiedValues[&I] = C;
507 0 : return true;
508 : }
509 : }
510 :
511 : // Disable any SROA on the argument to arbitrary unary operators.
512 1323 : disableSROA(Operand);
513 :
514 1323 : return false;
515 : }
516 :
517 : bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
518 27 : unsigned ArgNo = A->getArgNo();
519 27 : return CandidateCS.paramHasAttr(ArgNo+1, Attr);
520 : }
521 :
522 124 : bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
523 : // Does the *call site* have the NonNull attribute set on an argument? We
524 : // use the attribute on the call site to memoize any analysis done in the
525 : // caller. This will also trip if the callee function has a non-null
526 : // parameter attribute, but that's a less interesting case because hopefully
527 : // the callee would already have been simplified based on that.
528 124 : if (Argument *A = dyn_cast<Argument>(V))
529 27 : if (paramHasAttr(A, Attribute::NonNull))
530 : return true;
531 :
532 : // Is this an alloca in the caller? This is distinct from the attribute case
533 : // above because attributes aren't updated within the inliner itself and we
534 : // always want to catch the alloca derived case.
535 122 : if (isAllocaDerivedArg(V))
536 : // We can actually predict the result of comparisons between an
537 : // alloca-derived value and null. Note that this fires regardless of
538 : // SROA firing.
539 : return true;
540 :
541 120 : return false;
542 : }
543 :
/// Model a comparison: try several simplification strategies in order
/// (constant folding, common-base pointer folding, non-null argument
/// reasoning, SROA credit) before charging for the instruction.
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  // The pointer-tracking strategies below only apply to integer compares.
  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons. A null-compare of an
  // SROA-able alloca will fold away post-SROA, so credit it; any other
  // comparison use kills the candidate.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}
607 :
/// Model a subtraction, with a special case for differencing two pointers
/// that share a common base, before deferring to the generic binop handling.
bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}
634 :
635 9745 : bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
636 19490 : Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
637 9745 : const DataLayout &DL = F.getParent()->getDataLayout();
638 9745 : if (!isa<Constant>(LHS))
639 18506 : if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
640 37 : LHS = SimpleLHS;
641 9745 : if (!isa<Constant>(RHS))
642 9054 : if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
643 86 : RHS = SimpleRHS;
644 9745 : Value *SimpleV = nullptr;
645 9745 : if (auto FI = dyn_cast<FPMathOperator>(&I))
646 : SimpleV =
647 146 : SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
648 : else
649 19344 : SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
650 :
651 9745 : if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
652 82 : SimplifiedValues[&I] = C;
653 41 : return true;
654 : }
655 :
656 : // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
657 9704 : disableSROA(LHS);
658 9704 : disableSROA(RHS);
659 :
660 9704 : return false;
661 : }
662 :
663 5199 : bool CallAnalyzer::visitLoad(LoadInst &I) {
664 : Value *SROAArg;
665 5199 : DenseMap<Value *, int>::iterator CostIt;
666 5199 : if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
667 276 : if (I.isSimple()) {
668 276 : accumulateSROACost(CostIt, InlineConstants::InstrCost);
669 276 : return true;
670 : }
671 :
672 0 : disableSROA(CostIt);
673 : }
674 :
675 : return false;
676 : }
677 :
678 4086 : bool CallAnalyzer::visitStore(StoreInst &I) {
679 : Value *SROAArg;
680 4086 : DenseMap<Value *, int>::iterator CostIt;
681 4086 : if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
682 252 : if (I.isSimple()) {
683 252 : accumulateSROACost(CostIt, InlineConstants::InstrCost);
684 252 : return true;
685 : }
686 :
687 0 : disableSROA(CostIt);
688 : }
689 :
690 : return false;
691 : }
692 :
693 6 : bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
694 : // Constant folding for extract value is trivial.
695 12 : Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
696 6 : if (!C)
697 12 : C = SimplifiedValues.lookup(I.getAggregateOperand());
698 6 : if (C) {
699 6 : SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
700 2 : return true;
701 : }
702 :
703 : // SROA can look through these but give them a cost.
704 : return false;
705 : }
706 :
707 18 : bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
708 : // Constant folding for insert value is trivial.
709 36 : Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
710 18 : if (!AggC)
711 18 : AggC = SimplifiedValues.lookup(I.getAggregateOperand());
712 36 : Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
713 18 : if (!InsertedC)
714 36 : InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
715 18 : if (AggC && InsertedC) {
716 0 : SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
717 0 : I.getIndices());
718 0 : return true;
719 : }
720 :
721 : // SROA can look through these but give them a cost.
722 : return false;
723 : }
724 :
/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    // Each argument must be a literal constant or one proven constant at
    // this call site.
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}
759 :
/// Account for a call site inside the callee being analyzed.
///
/// Handles the special cases (returns_twice exposure, noduplicate calls,
/// known intrinsics, direct recursion) before charging the generic cost of
/// a call, and speculatively analyzes devirtualizable indirect calls.
/// Returns true if the call site is free after inlining, false if its base
/// cost must be charged.
bool CallAnalyzer::visitCallSite(CallSite CS) {
  // Inlining a returns_twice call into a caller not already marked
  // returns_twice would expose that behavior incorrectly.
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  // NOTE: this local F (the called function) intentionally shadows the
  // member F (the function being analyzed for inlining).
  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    // A call back to the function being analyzed: direct recursion.
    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost());
  }

  return Base::visitCallSite(CS);
}
843 :
844 : bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
845 : // At least one return instruction will be free after inlining.
846 1569 : bool Free = !HasReturn;
847 1569 : HasReturn = true;
848 : return Free;
849 : }
850 :
851 8068 : bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
852 : // We model unconditional branches as essentially free -- they really
853 : // shouldn't exist at all, but handling them makes the behavior of the
854 : // inliner more regular and predictable. Interestingly, conditional branches
855 : // which will fold away are also free.
856 25095 : return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
857 8862 : dyn_cast_or_null<ConstantInt>(
858 8068 : SimplifiedValues.lookup(BI.getCondition()));
859 : }
860 :
861 96 : bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
862 : // We model unconditional switches as free, see the comments on handling
863 : // branches.
864 192 : if (isa<ConstantInt>(SI.getCondition()))
865 : return true;
866 192 : if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
867 14 : if (isa<ConstantInt>(V))
868 : return true;
869 :
870 : // Otherwise, we need to accumulate a cost proportional to the number of
871 : // distinct successor blocks. This fan-out in the CFG cannot be represented
872 : // for free even if we can represent the core switch as a jumptable that
873 : // takes a single instruction.
874 : //
875 : // NB: We convert large switches which are just used to initialize large phi
876 : // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
877 : // inlining those. It will prevent inlining in cases where the optimization
878 : // does not (yet) fire.
879 82 : SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
880 82 : SuccessorBlocks.insert(SI.getDefaultDest());
881 164 : for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
882 264 : SuccessorBlocks.insert(I.getCaseSuccessor());
883 : // Add cost corresponding to the number of distinct destinations. The first
884 : // we model as free because of fallthrough.
885 82 : Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
886 82 : return false;
887 : }
888 :
889 : bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
890 : // We never want to inline functions that contain an indirectbr. This is
891 : // incorrect because all the blockaddress's (in static global initializers
892 : // for example) would be referring to the original function, and this
893 : // indirect jump would jump from the inlined copy of the function into the
894 : // original function which is extremely undefined behavior.
895 : // FIXME: This logic isn't really right; we can safely inline functions with
896 : // indirectbr's as long as no other function or global references the
897 : // blockaddress of a block within the current function.
898 0 : HasIndirectBr = true;
899 : return false;
900 : }
901 :
902 : bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
903 : // FIXME: It's not clear that a single instruction is an accurate model for
904 : // the inline cost of a resume instruction.
905 : return false;
906 : }
907 :
908 : bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
909 : // FIXME: It's not clear that a single instruction is an accurate model for
910 : // the inline cost of a cleanupret instruction.
911 : return false;
912 : }
913 :
bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}
919 :
920 : bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
921 : // FIXME: It might be reasonably to discount the cost of instructions leading
922 : // to unreachable as they have the lowest possible impact on both runtime and
923 : // code size.
924 : return true; // No actual code is needed for unreachable.
925 : }
926 :
927 5572 : bool CallAnalyzer::visitInstruction(Instruction &I) {
928 : // Some instructions are free. All of the free intrinsics can also be
929 : // handled by SROA, etc.
930 5572 : if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
931 : return true;
932 :
933 : // We found something we don't understand or can't handle. Mark any SROA-able
934 : // values in the operand list as no longer viable.
935 19672 : for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
936 9328 : disableSROA(*OI);
937 :
938 : return false;
939 : }
940 :
941 :
942 : /// \brief Analyze a basic block for its contribution to the inline cost.
943 : ///
944 : /// This method walks the analyzer over every instruction in the given basic
945 : /// block and accounts for their cost during inlining at this callsite. It
946 : /// aborts early if the threshold has been exceeded or an impossible to inline
947 : /// construct has been detected. It returns false if inlining is no longer
948 : /// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(I))
      continue;

    // These two totals feed the vector-density bonus computed in analyzeCall.
    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation is
    // expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If the visit of this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}
1019 :
1020 : /// \brief Compute the base pointer and cumulative constant offsets for V.
1021 : ///
1022 : /// This strips all constant offsets off of V, leaving it the base pointer, and
1023 : /// accumulates the total constant offset applied in the returned constant. It
1024 : /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1025 : /// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  // Only pointers can carry a constant offset.
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  // Accumulate the offset in a pointer-width integer.
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      // Bail on non-inbounds GEPs or ones whose offset isn't constant.
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      // Bitcasts don't change the address; look through them.
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      // An overridable alias might resolve to a different target at link
      // time, so we cannot look through it.
      if (GA->mayBeOverridden())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second); // Stop if we revisit a value (cycle).

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
1058 :
1059 : /// \brief Analyze a call site for potential inlining.
1060 : ///
1061 : /// Returns true if inlining this call is viable, and false if it is not
1062 : /// viable. It computes the cost and adjusts the threshold based on numerous
1063 : /// factors and heuristics. If this method returns false but the computed cost
1064 : /// is below the computed threshold, then inlining was forcibly disabled by
1065 : /// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);
  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                    &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If the instruction after the call, or if the normal destination of the
  // invoke is an unreachable instruction, the function is noreturn. As such,
  // there is little point in inlining this unless there is literally zero
  // cost.
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
      Threshold = 0;
  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
    Threshold = 0;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  // A declaration-only callee is trivially "inlinable" from a cost
  // perspective; there is no body to analyze.
  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues)) {
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
          HasIndirectBr || HasFrameEscape)
        return false;

      // If the caller is a recursive function then we don't want to inline
      // functions which allocate a lot of stack space because it would increase
      // the caller stack usage dramatically.
      if (IsCallerRecursive &&
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;

      break;
    }

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          // Only the taken successor stays live.
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        // Only the matching case's successor stays live.
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < Threshold;
}
1300 :
1301 : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1302 : /// \brief Dump stats about this call's analysis.
1303 : void CallAnalyzer::dump() {
1304 : #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1305 : DEBUG_PRINT_STAT(NumConstantArgs);
1306 : DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1307 : DEBUG_PRINT_STAT(NumAllocaArgs);
1308 : DEBUG_PRINT_STAT(NumConstantPtrCmps);
1309 : DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1310 : DEBUG_PRINT_STAT(NumInstructionsSimplified);
1311 : DEBUG_PRINT_STAT(NumInstructions);
1312 : DEBUG_PRINT_STAT(SROACostSavings);
1313 : DEBUG_PRINT_STAT(SROACostSavingsLost);
1314 : DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1315 : DEBUG_PRINT_STAT(Cost);
1316 : DEBUG_PRINT_STAT(Threshold);
1317 : #undef DEBUG_PRINT_STAT
1318 : }
1319 : #endif
1320 :
// Register the pass with the legacy pass manager. The trailing (true, true)
// flags mark it as CFG-only and as an analysis pass; it depends on the target
// transform info and the assumption cache.
INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                      true, true)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                    true, true)

// Pass identification token; the pass manager keys off its address, not its
// value.
char InlineCostAnalysis::ID = 0;
1329 :
1330 9070 : InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {}
1331 :
1332 13563 : InlineCostAnalysis::~InlineCostAnalysis() {}
1333 :
void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  // This pass only computes costs; it never transforms the IR, so it
  // preserves everything. It consults TTI and the assumption cache.
  AU.setPreservesAll();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  CallGraphSCCPass::getAnalysisUsage(AU);
}
1340 :
1341 76450 : bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
1342 152900 : TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
1343 152900 : ACT = &getAnalysis<AssumptionCacheTracker>();
1344 76450 : return false;
1345 : }
1346 :
// Convenience overload: compute the inline cost of CS against its statically
// called function.
InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
  return getInlineCost(CS, CS.getCalledFunction(), Threshold);
}
1350 :
1351 : /// \brief Test that two functions either have or have not the given attribute
1352 : /// at the same time.
1353 : template<typename AttrKind>
1354 4926 : static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
1355 14778 : return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
1356 : }
1357 :
1358 : /// \brief Test that there are no attribute conflicts between Caller and Callee
1359 : /// that prevent inlining.
1360 1649 : static bool functionsHaveCompatibleAttributes(Function *Caller,
1361 : Function *Callee,
1362 : TargetTransformInfo &TTI) {
1363 3295 : return TTI.areInlineCompatible(Caller, Callee) &&
1364 3288 : attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
1365 4929 : attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
1366 3287 : attributeMatches(Caller, Callee, Attribute::SanitizeThread);
1367 : }
1368 :
// Compute the inline cost of CS treating Callee as the target, running the
// veto checks (always-inline, attribute compatibility, optnone, noinline,
// link-time overridability) before the full CallAnalyzer pass. The order of
// these checks matters: always-inline wins over everything that follows.
InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                             int Threshold) {
  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
                                         TTIWP->getTTI(*Callee)))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be redefined at link-time to mean
  // something else. Don't inline functions marked noinline or call sites
  // marked noinline.
  if (Callee->mayBeOverridden() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
        << "...\n");

  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}
1416 :
1417 15790 : bool InlineCostAnalysis::isInlineViable(Function &F) {
1418 15790 : bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
1419 31580 : for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
1420 : // Disallow inlining of functions which contain indirect branches or
1421 : // blockaddresses.
1422 48036 : if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
1423 : return false;
1424 :
1425 48030 : for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
1426 : ++II) {
1427 371298 : CallSite CS(II);
1428 185649 : if (!CS)
1429 : continue;
1430 :
1431 : // Disallow recursive calls.
1432 13325 : if (&F == CS.getCalledFunction())
1433 : return false;
1434 :
1435 : // Disallow calls which expose returns-twice to a function not previously
1436 : // attributed as such.
1437 39933 : if (!ReturnsTwice && CS.isCall() &&
1438 26596 : cast<CallInst>(CS.getInstruction())->canReturnTwice())
1439 : return false;
1440 :
1441 : // Disallow inlining functions that call @llvm.localescape. Doing this
1442 : // correctly would require major changes to the inliner.
1443 26616 : if (CS.getCalledFunction() &&
1444 : CS.getCalledFunction()->getIntrinsicID() ==
1445 : llvm::Intrinsic::localescape)
1446 : return false;
1447 : }
1448 : }
1449 :
1450 : return true;
1451 : }
|