//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top-level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as BPI
// and BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    // getAsInteger() returns true on error, false on success.
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
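
// For example, a call site carrying the string attribute
// "call-inline-cost"="100" yields 100 from
// getStringFnAttrAsInt(CB, "call-inline-cost"); a missing attribute or a
// non-integer value yields std::nullopt.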

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as an implementation detail of functions
/// such as llvm::getInlineCost. Note the function_ref constructor parameters -
/// the expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've already determined the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of the given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of the given size. Return false to stop
  /// further processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  /// If \p V maps to an SROA candidate alloca, either credit the use (when
  /// \p DoNotDisable is true) or disable SROA for that alloca. Returns true
  /// only when the use was credited as an SROA savings opportunity.
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG-altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive calls.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
//    f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This will lead to a binary tree where the leaf should be either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number of non-leaf nodes should be:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1.
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
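
// For example, with NumCaseCluster = 8 the recurrence gives
// f(8) = 1 + 2 * f(4) = 1 + 2 * (1 + f(2) + f(2)) = 11 nodes, matching the
// closed form above: 8 * 3 / 2 - 1 = 11 expected comparisons.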

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when the percentage of vector instructions in the
  /// callee is high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only when the
  // flag PrintInstructionComments is on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units; accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::max<int64_t>(std::min<int64_t>(INT_MAX, Inc), INT_MIN);
    Cost = std::max<int64_t>(std::min<int64_t>(INT_MAX, Inc + Cost), INT_MIN);
  }
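
  // For instance, if Cost is already near INT_MAX, a further addCost(INT_MAX)
  // leaves Cost clamped at INT_MAX rather than wrapping to a negative value:
  // both the increment and the running sum are saturated to the 'int' range.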

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(CallPenalty);
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // The maximum valid cost is increased in this function.
    if (JumpTableSize) {
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;

      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }
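
  // With the default InstrCost of 5, a jump table of 16 entries costs
  // 16 * 5 + 4 * 5 = 100, while 8 case clusters without a jump table cost
  // getExpectedNumberOfCompare(8) * 2 * 5 = 11 * 2 * 5 = 110.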
  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call sites.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to decide.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^24 (or 2^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Return true if the savings justify the cost of inlining. Specifically,
    // we evaluate the following inequality:
    //
    //  CycleSavings      PSI->getOrCompHotCountThreshold()
    //  ------------  >=  ----------------------------------
    //      Size               InlineSavingsMultiplier
    //
    // Note that the left hand side is specific to a call site. The right hand
    // side is a constant for the entire executable.
    APInt LHS = CycleSavings;
    LHS *= InlineSavingsMultiplier;
    APInt RHS(128, PSI->getOrCompHotCountThreshold());
    RHS *= Size;
    return LHS.uge(RHS);
  }
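
  // As an illustration (numbers are hypothetical, not from any real profile):
  // with CycleSavings = 4000 per call, Size = 100, the default
  // InlineSavingsMultiplier of 8, and a hot count threshold of 1000, we get
  // LHS = 4000 * 8 = 32000 and RHS = 1000 * 100 = 100000, so the savings do
  // not justify inlining on this basis and costBenefitAnalysis returns false.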

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    if (InstructionCostDetailMap.contains(I))
      return InstructionCostDetailMap[I];
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 4;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/*DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {

    if (JumpTableSize) {
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzers - instead, we should abstract it to a common method in the
    // CallAnalyzer.
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero. That happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}

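// With -print-instruction-comments on, an annotated instruction typically
// looks like (illustrative values):
//   ; cost before = 20, cost after = 25, threshold before = 487,
//     threshold after = 487, cost delta = 5
// (one line in the actual output), with ", simplified to <constant>" appended
// when the instruction was folded and a threshold delta printed only when the
// threshold changed.
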
/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
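
// For example, for a GEP into %struct = type { i32, i32, i64 } with constant
// indices 0 and 2, a typical 64-bit DataLayout places the i64 field at byte
// offset 8, so this routine accumulates 8 into Offset.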

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getInstructionCost(&GEP, Operands,
                                TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen above
      // a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end up
      // being too pessimistic and prevent inlining non-problematic code. This
      // could result in unintended perf regressions. A better overall strategy
      // is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(),
          DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
                                  AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
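
// For example, %p = phi i32 [ 7, %bb0 ], [ 7, %bb1 ] (where each incoming
// value is either literally 7 or simplified to 7 at this call site) maps the
// phi to the constant 7, while incoming pointers sharing one base and constant
// offset propagate that base/offset pair to the phi instead.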

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}
1574
1575/// Simplify \p I if its operands are constants and update SimplifiedValues.
1576bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1578 for (Value *Op : I.operands()) {
1579 Constant *COp = dyn_cast<Constant>(Op);
1580 if (!COp)
1581 COp = SimplifiedValues.lookup(Op);
1582 if (!COp)
1583 return false;
1584 COps.push_back(COp);
1585 }
1586 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1587 if (!C)
1588 return false;
1589 SimplifiedValues[&I] = C;
1590 return true;
1591}
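
// Example: if both operands of "%r = add i32 %x, %y" were previously mapped
// to constants (say 2 and 3), ConstantFoldInstOperands yields i32 5, %r is
// recorded as simplified, and the add costs nothing after inlining.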
1592
1593/// Try to simplify a call to llvm.is.constant.
1594///
1595/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1596/// we expect calls of this specific intrinsic to be infrequent.
1597///
1598/// FIXME: Given that we know CB's parent (F) caller
1599/// (CandidateCall->getParent()->getParent()), we might be able to determine
1600/// whether inlining F into F's caller would change how the call to
1601/// llvm.is.constant would evaluate.
1602bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1603 Value *Arg = CB.getArgOperand(0);
1604 auto *C = dyn_cast<Constant>(Arg);
1605
1606 if (!C)
1607 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1608
1609 Type *RT = CB.getFunctionType()->getReturnType();
1610 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1611 return true;
1612}
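
// Example: "%b = call i1 @llvm.is.constant.i32(i32 %v)" is mapped to i1 true
// here when %v is (or has simplified to) a constant at this call site, and
// to i1 false otherwise; the call itself is treated as free either way.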
1613
1614bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1615 // As per the langref, "The fourth argument to llvm.objectsize determines if
1616 // the value should be evaluated at runtime."
1617 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1618 return false;
1619
1620 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1621 /*MustSucceed=*/true);
1622 Constant *C = dyn_cast_or_null<Constant>(V);
1623 if (C)
1624 SimplifiedValues[&CB] = C;
1625 return C;
1626}
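
// Example: "@llvm.objectsize.i64.p0(ptr %p, i1 false, i1 true, i1 false)" is
// folded here because its fourth (dynamic) argument is false; as MustSucceed
// is true, the result is either the known size or the "unknown" sentinel.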
1627
1628bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1629 // Propagate constants through bitcasts.
1630 if (simplifyInstruction(I))
1631 return true;
1632
1633 // Track base/offsets through casts
1634 std::pair<Value *, APInt> BaseAndOffset =
1635 ConstantOffsetPtrs.lookup(I.getOperand(0));
1636 // Casts don't change the offset, just wrap it up.
1637 if (BaseAndOffset.first)
1638 ConstantOffsetPtrs[&I] = BaseAndOffset;
1639
1640 // Also look for SROA candidates here.
1641 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1642 SROAArgValues[&I] = SROAArg;
1643
1644 // Bitcasts are always zero cost.
1645 return true;
1646}
1647
1648bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1649 // Propagate constants through ptrtoint.
1650 if (simplifyInstruction(I))
1651 return true;
1652
1653 // Track base/offset pairs when converted to a plain integer provided the
1654 // integer is large enough to represent the pointer.
1655 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1656 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1657 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1658 std::pair<Value *, APInt> BaseAndOffset =
1659 ConstantOffsetPtrs.lookup(I.getOperand(0));
1660 if (BaseAndOffset.first)
1661 ConstantOffsetPtrs[&I] = BaseAndOffset;
1662 }
1663
1664 // This is really weird. Technically, ptrtoint will disable SROA. However,
1665 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1666 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1667 // would block SROA would also block SROA if applied directly to a pointer,
1668 // and so we can just add the integer in here. The only places where SROA is
1669 // preserved either cannot fire on an integer, or won't in-and-of themselves
1670 // disable SROA (ext) w/o some later use that we would see and disable.
1671 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1672 SROAArgValues[&I] = SROAArg;
1673
1674 return TargetTransformInfo::TCC_Free ==
1675 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1676}
1677
1678bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1679 // Propagate constants through inttoptr.
1680 if (simplifyInstruction(I))
1681 return true;
1682
1683 // Track base/offset pairs when round-tripped through a pointer without
1684 // modifications provided the integer is not too large.
1685 Value *Op = I.getOperand(0);
1686 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1687 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1688 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1689 if (BaseAndOffset.first)
1690 ConstantOffsetPtrs[&I] = BaseAndOffset;
1691 }
1692
1693 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1694 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1695 SROAArgValues[&I] = SROAArg;
1696
1697 return TargetTransformInfo::TCC_Free ==
1698 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1699}
1700
1701bool CallAnalyzer::visitCastInst(CastInst &I) {
1702 // Propagate constants through casts.
1703 if (simplifyInstruction(I))
1704 return true;
1705
1706 // Disable SROA in the face of arbitrary casts we don't explicitly list
1707 // elsewhere.
1708 disableSROA(I.getOperand(0));
1709
1710 // If this is a floating-point cast, and the target says this operation
1711 // is expensive, this may eventually become a library call. Treat the cost
1712 // as such.
1713 switch (I.getOpcode()) {
1714 case Instruction::FPTrunc:
1715 case Instruction::FPExt:
1716 case Instruction::UIToFP:
1717 case Instruction::SIToFP:
1718 case Instruction::FPToUI:
1719 case Instruction::FPToSI:
1720 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1721 onCallPenalty();
1722 break;
1723 default:
1724 break;
1725 }
1726
1727 return TargetTransformInfo::TCC_Free ==
1728 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1729}
1730
1731bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1732 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1733}
1734
1735bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1736 // Does the *call site* have the NonNull attribute set on an argument? We
1737 // use the attribute on the call site to memoize any analysis done in the
1738 // caller. This will also trip if the callee function has a non-null
1739 // parameter attribute, but that's a less interesting case because hopefully
1740 // the callee would already have been simplified based on that.
1741 if (Argument *A = dyn_cast<Argument>(V))
1742 if (paramHasAttr(A, Attribute::NonNull))
1743 return true;
1744
1745 // Is this an alloca in the caller? This is distinct from the attribute case
1746 // above because attributes aren't updated within the inliner itself and we
1747 // always want to catch the alloca derived case.
1748 if (isAllocaDerivedArg(V))
1749 // We can actually predict the result of comparisons between an
1750 // alloca-derived value and null. Note that this fires regardless of
1751 // SROA firing.
1752 return true;
1753
1754 return false;
1755}
1756
1757bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1758 // If the normal destination of the invoke or the parent block of the call
1759 // site is unreachable-terminated, there is little point in inlining this
1760 // unless there is literally zero cost.
1761 // FIXME: Note that it is possible that an unreachable-terminated block has a
1762 // hot entry. For example, in below scenario inlining hot_call_X() may be
1763 // beneficial :
1764 // main() {
1765 // hot_call_1();
1766 // ...
1767 // hot_call_N()
1768 // exit(0);
1769 // }
1770 // For now, we are not handling this corner case here as it is rare in real
1771 // code. In future, we should elaborate this based on BPI and BFI in more
1772 // general threshold adjusting heuristics in updateThreshold().
1773 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1774 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1775 return false;
1776 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1777 return false;
1778
1779 return true;
1780}
1781
1782bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1783 BlockFrequencyInfo *CallerBFI) {
1784 // If global profile summary is available, then callsite's coldness is
1785 // determined based on that.
1786 if (PSI && PSI->hasProfileSummary())
1787 return PSI->isColdCallSite(Call, CallerBFI);
1788
1789 // Otherwise we need BFI to be available.
1790 if (!CallerBFI)
1791 return false;
1792
1793 // Determine if the callsite is cold relative to caller's entry. We could
1794 // potentially cache the computation of scaled entry frequency, but the added
1795 // complexity is not worth it unless this scaling shows up high in the
1796 // profiles.
1797 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1798 auto CallSiteBB = Call.getParent();
1799 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1800 auto CallerEntryFreq =
1801 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1802 return CallSiteFreq < CallerEntryFreq * ColdProb;
1803}
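
// Example: with the default cold-callsite-rel-freq of 2, a callsite whose
// block executes fewer than 2 times per 100 entries into the caller is
// classified as cold when no profile summary is available.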
1804
1805std::optional<int>
1806InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1807 BlockFrequencyInfo *CallerBFI) {
1808
1809 // If global profile summary is available, then callsite's hotness is
1810 // determined based on that.
1811 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1812 return Params.HotCallSiteThreshold;
1813
1814 // Otherwise we need BFI to be available and to have a locally hot callsite
1815 // threshold.
1816 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1817 return std::nullopt;
1818
1819 // Determine if the callsite is hot relative to caller's entry. We could
1820 // potentially cache the computation of scaled entry frequency, but the added
1821 // complexity is not worth it unless this scaling shows up high in the
1822 // profiles.
1823 const BasicBlock *CallSiteBB = Call.getParent();
1824 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1825 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1826 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1827 if (Limit && CallSiteFreq >= *Limit)
1828 return Params.LocallyHotCallSiteThreshold;
1829
1830 // Otherwise treat it normally.
1831 return std::nullopt;
1832}
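
// Example arithmetic (assuming the default hot-callsite-rel-freq multiplier
// of 60): a caller entry frequency of 16 gives a limit of 960, so a callsite
// block with frequency >= 960 receives the locally hot threshold (525 by
// default).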
1833
1834void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1835 // If no size growth is allowed for this inlining, set Threshold to 0.
1836 if (!allowSizeGrowth(Call)) {
1837 Threshold = 0;
1838 return;
1839 }
1840
1841 Function *Caller = Call.getCaller();
1842
1843 // return min(A, B) if B is valid.
1844 auto MinIfValid = [](int A, std::optional<int> B) {
1845 return B ? std::min(A, *B) : A;
1846 };
1847
1848 // return max(A, B) if B is valid.
1849 auto MaxIfValid = [](int A, std::optional<int> B) {
1850 return B ? std::max(A, *B) : A;
1851 };
1852
1853 // Various bonus percentages. These are multiplied by Threshold to get the
1854 // bonus values.
1855 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1856 // basic block at the given callsite context. This is speculatively applied
1857 // and withdrawn if more than one basic block is seen.
1858 //
1859 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1860 // of the last call to a static function as inlining such functions is
1861 // guaranteed to reduce code size.
1862 //
1863 // These bonus percentages may be set to 0 based on properties of the caller
1864 // and the callsite.
1865 int SingleBBBonusPercent = 50;
1866 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1867 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1868
1869 // Lambda to set all the above bonus and bonus percentages to 0.
1870 auto DisallowAllBonuses = [&]() {
1871 SingleBBBonusPercent = 0;
1872 VectorBonusPercent = 0;
1873 LastCallToStaticBonus = 0;
1874 };
1875
1876 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1877 // and reduce the threshold if the caller has the necessary attribute.
1878 if (Caller->hasMinSize()) {
1879 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1880 // For minsize, we want to disable the single BB bonus and the vector
1881 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1882 // a static function will, at the minimum, eliminate the parameter setup and
1883 // call/return instructions.
1884 SingleBBBonusPercent = 0;
1885 VectorBonusPercent = 0;
1886 } else if (Caller->hasOptSize())
1887 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1888
1889 // Adjust the threshold based on inlinehint attribute and profile based
1890 // hotness information if the caller does not have MinSize attribute.
1891 if (!Caller->hasMinSize()) {
1892 if (Callee.hasFnAttribute(Attribute::InlineHint))
1893 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1894
1895 // FIXME: After switching to the new passmanager, simplify the logic below
1896 // by checking only the callsite hotness/coldness as we will reliably
1897 // have local profile information.
1898 //
1899 // Callsite hotness and coldness can be determined if sample profile is
1900 // used (which adds hotness metadata to calls) or if caller's
1901 // BlockFrequencyInfo is available.
1902 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1903 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1904 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1905 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1906 // FIXME: This should update the threshold only if it exceeds the
1907 // current threshold, but AutoFDO + ThinLTO currently relies on this
1908 // behavior to prevent inlining of hot callsites during ThinLTO
1909 // compile phase.
1910 Threshold = *HotCallSiteThreshold;
1911 } else if (isColdCallSite(Call, CallerBFI)) {
1912 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1913 // Do not apply bonuses for a cold callsite including the
1914 // LastCallToStatic bonus. While this bonus might result in code size
1915 // reduction, it can cause the size of a non-cold caller to increase
1916 // preventing it from being inlined.
1917 DisallowAllBonuses();
1918 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1919 } else if (PSI) {
1920 // Use callee's global profile information only if we have no way of
1921 // determining this via callsite information.
1922 if (PSI->isFunctionEntryHot(&Callee)) {
1923 LLVM_DEBUG(dbgs() << "Hot callee.\n");
1924 // If callsite hotness can not be determined, we may still know
1925 // that the callee is hot and treat it as a weaker hint for threshold
1926 // increase.
1927 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1928 } else if (PSI->isFunctionEntryCold(&Callee)) {
1929 LLVM_DEBUG(dbgs() << "Cold callee.\n");
1930 // Do not apply bonuses for a cold callee including the
1931 // LastCallToStatic bonus. While this bonus might result in code size
1932 // reduction, it can cause the size of a non-cold caller to increase
1933 // preventing it from being inlined.
1934 DisallowAllBonuses();
1935 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
1936 }
1937 }
1938 }
1939
1940 Threshold += TTI.adjustInliningThreshold(&Call);
1941
1942 // Finally, take the target-specific inlining threshold multiplier into
1943 // account.
1944 Threshold *= TTI.getInliningThresholdMultiplier();
1945
1946 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1947 VectorBonus = Threshold * VectorBonusPercent / 100;
1948
1949 // If there is only one call of the function, and it has internal linkage,
1950 // the cost of inlining it drops dramatically. It may seem odd to update
1951 // Cost in updateThreshold, but the bonus depends on the logic in this method.
1952 if (isSoleCallToLocalFunction(Call, F)) {
1953 Cost -= LastCallToStaticBonus;
1954 StaticBonusApplied = LastCallToStaticBonus;
1955 }
1956}
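
// Worked example: with the default inline threshold of 225 and no profile
// hints, SingleBBBonus = 225 * 50 / 100 = 112; the bonus is speculative and
// is withdrawn if the callee turns out to have more than one reachable
// basic block.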
1957
1958bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1959 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1960 // First try to handle simplified comparisons.
1961 if (simplifyInstruction(I))
1962 return true;
1963
1964 if (I.getOpcode() == Instruction::FCmp)
1965 return false;
1966
1967 // Otherwise look for a comparison between constant offset pointers with
1968 // a common base.
1969 Value *LHSBase, *RHSBase;
1970 APInt LHSOffset, RHSOffset;
1971 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1972 if (LHSBase) {
1973 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1974 if (RHSBase && LHSBase == RHSBase) {
1975 // We have common bases, fold the icmp to a constant based on the
1976 // offsets.
1977 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1978 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1979 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1980 SimplifiedValues[&I] = C;
1981 ++NumConstantPtrCmps;
1982 return true;
1983 }
1984 }
1985 }
1986
1987 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
1988 for (auto *User : I.users())
1989 if (auto *Instr = dyn_cast<Instruction>(User))
1990 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
1991 return false;
1992 return true;
1993 };
1994
1995 // If the comparison is an equality comparison with null, we can simplify it
1996 // if we know the value (argument) can't be null
1997 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
1998 if (isKnownNonNullInCallee(I.getOperand(0))) {
1999 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2000 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2001 : ConstantInt::getFalse(I.getType());
2002 return true;
2003 }
2004 // Implicit null checks act as unconditional branches and their comparisons
2005 // should be treated as simplified and free of cost.
2006 if (isImplicitNullCheckCmp(I))
2007 return true;
2008 }
2009 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2010}
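
// Example (illustrative IR): offsets from a common base fold to a constant:
//
//   %p = getelementptr inbounds i32, ptr %base, i64 1   ; offset 4
//   %q = getelementptr inbounds i32, ptr %base, i64 2   ; offset 8
//   %c = icmp ult ptr %p, %q                            ; folds to true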
2011
2012bool CallAnalyzer::visitSub(BinaryOperator &I) {
2013 // Try to handle a special case: we can fold computing the difference of two
2014 // constant-related pointers.
2015 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2016 Value *LHSBase, *RHSBase;
2017 APInt LHSOffset, RHSOffset;
2018 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2019 if (LHSBase) {
2020 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2021 if (RHSBase && LHSBase == RHSBase) {
2022 // We have common bases, fold the subtract to a constant based on the
2023 // offsets.
2024 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2025 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2026 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2027 SimplifiedValues[&I] = C;
2028 ++NumConstantPtrDiffs;
2029 return true;
2030 }
2031 }
2032 }
2033
2034 // Otherwise, fall back to the generic logic for simplifying and handling
2035 // instructions.
2036 return Base::visitSub(I);
2037}
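
// Example (illustrative IR): with %p and %q tracked as the same base plus
// offsets 4 and 8, the difference of their ptrtoint values,
// "%d = sub i64 %qi, %pi", folds here to i64 4.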
2038
2039bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2040 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2041 Constant *CLHS = dyn_cast<Constant>(LHS);
2042 if (!CLHS)
2043 CLHS = SimplifiedValues.lookup(LHS);
2044 Constant *CRHS = dyn_cast<Constant>(RHS);
2045 if (!CRHS)
2046 CRHS = SimplifiedValues.lookup(RHS);
2047
2048 Value *SimpleV = nullptr;
2049 if (auto FI = dyn_cast<FPMathOperator>(&I))
2050 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2051 FI->getFastMathFlags(), DL);
2052 else
2053 SimpleV =
2054 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2055
2056 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2057 SimplifiedValues[&I] = C;
2058
2059 if (SimpleV)
2060 return true;
2061
2062 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2063 disableSROA(LHS);
2064 disableSROA(RHS);
2065
2066 // If the instruction is floating point, and the target says this operation
2067 // is expensive, this may eventually become a library call. Treat the cost
2068 // as such. Unless it's fneg which can be implemented with an xor.
2069 using namespace llvm::PatternMatch;
2070 if (I.getType()->isFloatingPointTy() &&
2071 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2072 !match(&I, m_FNeg(m_Value())))
2073 onCallPenalty();
2074
2075 return false;
2076}
2077
2078bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2079 Value *Op = I.getOperand(0);
2080 Constant *COp = dyn_cast<Constant>(Op);
2081 if (!COp)
2082 COp = SimplifiedValues.lookup(Op);
2083
2084 Value *SimpleV = simplifyFNegInst(
2085 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2086
2087 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2088 SimplifiedValues[&I] = C;
2089
2090 if (SimpleV)
2091 return true;
2092
2093 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2094 disableSROA(Op);
2095
2096 return false;
2097}
2098
2099bool CallAnalyzer::visitLoad(LoadInst &I) {
2100 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2101 return true;
2102
2103 // If the data is already loaded from this address and hasn't been clobbered
2104 // by any stores or calls, this load is likely to be redundant and can be
2105 // eliminated.
2106 if (EnableLoadElimination &&
2107 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2108 onLoadEliminationOpportunity();
2109 return true;
2110 }
2111
2112 onMemAccess();
2113 return false;
2114}
2115
2116bool CallAnalyzer::visitStore(StoreInst &I) {
2117 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2118 return true;
2119
2120 // The store can potentially clobber loads and prevent repeated loads from
2121 // being eliminated.
2122 // FIXME:
2123 // 1. We can probably keep an initial set of eliminatable loads subtracted
2124 // from the cost even when we finally see a store. We just need to disable
2125 // *further* accumulation of elimination savings.
2126 // 2. We should probably at some point thread MemorySSA for the callee into
2127 // this and then use that to actually compute *really* precise savings.
2128 disableLoadElimination();
2129
2130 onMemAccess();
2131 return false;
2132}
2133
2134bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2135 // Constant folding for extract value is trivial.
2136 if (simplifyInstruction(I))
2137 return true;
2138
2139 // SROA can't look through these, but they may be free.
2140 return Base::visitExtractValue(I);
2141}
2142
2143bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2144 // Constant folding for insert value is trivial.
2145 if (simplifyInstruction(I))
2146 return true;
2147
2148 // SROA can't look through these, but they may be free.
2149 return Base::visitInsertValue(I);
2150}
2151
2152/// Try to simplify a call site.
2153///
2154/// Takes a concrete function and callsite and tries to actually simplify it by
2155/// analyzing the arguments and call itself with instsimplify. Returns true if
2156/// it has simplified the callsite to some other entity (a constant), making it
2157/// free.
2158bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2159 // FIXME: Using the instsimplify logic directly for this is inefficient
2160 // because we have to continually rebuild the argument list even when no
2161 // simplifications can be performed. Until that is fixed with remapping
2162 // inside of instsimplify, directly constant fold calls here.
2163 if (!canConstantFoldCallTo(&Call, F))
2164 return false;
2165
2166 // Try to re-map the arguments to constants.
2167 SmallVector<Constant *, 4> ConstantArgs;
2168 ConstantArgs.reserve(Call.arg_size());
2169 for (Value *I : Call.args()) {
2170 Constant *C = dyn_cast<Constant>(I);
2171 if (!C)
2172 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2173 if (!C)
2174 return false; // This argument doesn't map to a constant.
2175
2176 ConstantArgs.push_back(C);
2177 }
2178 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2179 SimplifiedValues[&Call] = C;
2180 return true;
2181 }
2182
2183 return false;
2184}
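
// Example: "%r = call i32 @llvm.smax.i32(i32 %a, i32 3)" constant-folds here
// once %a has simplified to a constant at this call site, making the call
// free.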
2185
2186bool CallAnalyzer::visitCallBase(CallBase &Call) {
2187 if (!onCallBaseVisitStart(Call))
2188 return true;
2189
2190 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2191 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2192 // This aborts the entire analysis.
2193 ExposesReturnsTwice = true;
2194 return false;
2195 }
2196 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2197 ContainsNoDuplicateCall = true;
2198
2199 Function *F = Call.getCalledFunction();
2200 bool IsIndirectCall = !F;
2201 if (IsIndirectCall) {
2202 // Check if this happens to be an indirect function call to a known function
2203 // in this inline context. If not, we've done all we can.
2204 Value *Callee = Call.getCalledOperand();
2205 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2206 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2207 onCallArgumentSetup(Call);
2208
2209 if (!Call.onlyReadsMemory())
2210 disableLoadElimination();
2211 return Base::visitCallBase(Call);
2212 }
2213 }
2214
2215 assert(F && "Expected a call to a known function");
2216
2217 // When we have a concrete function, first try to simplify it directly.
2218 if (simplifyCallSite(F, Call))
2219 return true;
2220
2221 // Next check if it is an intrinsic we know about.
2222 // FIXME: Lift this into part of the InstVisitor.
2223 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2224 switch (II->getIntrinsicID()) {
2225 default:
2226 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2227 disableLoadElimination();
2228 return Base::visitCallBase(Call);
2229
2230 case Intrinsic::load_relative:
2231 onLoadRelativeIntrinsic();
2232 return false;
2233
2234 case Intrinsic::memset:
2235 case Intrinsic::memcpy:
2236 case Intrinsic::memmove:
2237 disableLoadElimination();
2238 // SROA can usually chew through these intrinsics, but they aren't free.
2239 return false;
2240 case Intrinsic::icall_branch_funnel:
2241 case Intrinsic::localescape:
2242 HasUninlineableIntrinsic = true;
2243 return false;
2244 case Intrinsic::vastart:
2245 InitsVargArgs = true;
2246 return false;
2247 case Intrinsic::launder_invariant_group:
2248 case Intrinsic::strip_invariant_group:
2249 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2250 SROAArgValues[II] = SROAArg;
2251 return true;
2252 case Intrinsic::is_constant:
2253 return simplifyIntrinsicCallIsConstant(Call);
2254 case Intrinsic::objectsize:
2255 return simplifyIntrinsicCallObjectSize(Call);
2256 }
2257 }
2258
2259 if (F == Call.getFunction()) {
2260 // This flag will fully abort the analysis, so don't bother with anything
2261 // else.
2262 IsRecursiveCall = true;
2263 if (!AllowRecursiveCall)
2264 return false;
2265 }
2266
2267 if (TTI.isLoweredToCall(F)) {
2268 onLoweredCall(F, Call, IsIndirectCall);
2269 }
2270
2271 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2272 disableLoadElimination();
2273 return Base::visitCallBase(Call);
2274}
2275
2276bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2277 // At least one return instruction will be free after inlining.
2278 bool Free = !HasReturn;
2279 HasReturn = true;
2280 return Free;
2281}
2282
2283bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2284 // We model unconditional branches as essentially free -- they really
2285 // shouldn't exist at all, but handling them makes the behavior of the
2286 // inliner more regular and predictable. Interestingly, conditional branches
2287 // which will fold away are also free.
2288 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2289 BI.getMetadata(LLVMContext::MD_make_implicit) ||
2290 isa_and_nonnull<ConstantInt>(
2291 SimplifiedValues.lookup(BI.getCondition()));
2292}
2293
2294bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2295 bool CheckSROA = SI.getType()->isPointerTy();
2296 Value *TrueVal = SI.getTrueValue();
2297 Value *FalseVal = SI.getFalseValue();
2298
2299 Constant *TrueC = dyn_cast<Constant>(TrueVal);
2300 if (!TrueC)
2301 TrueC = SimplifiedValues.lookup(TrueVal);
2302 Constant *FalseC = dyn_cast<Constant>(FalseVal);
2303 if (!FalseC)
2304 FalseC = SimplifiedValues.lookup(FalseVal);
2305 Constant *CondC =
2306 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2307
2308 if (!CondC) {
2309 // Select C, X, X => X
2310 if (TrueC == FalseC && TrueC) {
2311 SimplifiedValues[&SI] = TrueC;
2312 return true;
2313 }
2314
2315 if (!CheckSROA)
2316 return Base::visitSelectInst(SI);
2317
2318 std::pair<Value *, APInt> TrueBaseAndOffset =
2319 ConstantOffsetPtrs.lookup(TrueVal);
2320 std::pair<Value *, APInt> FalseBaseAndOffset =
2321 ConstantOffsetPtrs.lookup(FalseVal);
2322 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2323 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2324
2325 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2326 SROAArgValues[&SI] = SROAArg;
2327 return true;
2328 }
2329
2330 return Base::visitSelectInst(SI);
2331 }
2332
2333 // Select condition is a constant.
2334 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2335 : (CondC->isNullValue()) ? FalseVal
2336 : nullptr;
2337 if (!SelectedV) {
2338 // Condition is a vector constant that is not all 1s or all 0s. If all
2339 // operands are constants, ConstantFoldSelectInstruction() can handle the
2340 // cases such as select vectors.
2341 if (TrueC && FalseC) {
2342 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2343 SimplifiedValues[&SI] = C;
2344 return true;
2345 }
2346 }
2347 return Base::visitSelectInst(SI);
2348 }
2349
2350 // Condition is either all 1s or all 0s. SI can be simplified.
2351 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2352 SimplifiedValues[&SI] = SelectedC;
2353 return true;
2354 }
2355
2356 if (!CheckSROA)
2357 return true;
2358
2359 std::pair<Value *, APInt> BaseAndOffset =
2360 ConstantOffsetPtrs.lookup(SelectedV);
2361 if (BaseAndOffset.first) {
2362 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2363
2364 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2365 SROAArgValues[&SI] = SROAArg;
2366 }
2367
2368 return true;
2369}
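
// Example: "select i1 true, i32 %x, i32 %y" selects %x outright: a constant
// %x is recorded in SimplifiedValues, while for pointer-typed selects the
// chosen operand's base+offset pair and SROA candidacy are inherited
// instead.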
2370
2371bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2372 // We model unconditional switches as free, see the comments on handling
2373 // branches.
2374 if (isa<ConstantInt>(SI.getCondition()))
2375 return true;
2376 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2377 if (isa<ConstantInt>(V))
2378 return true;
2379
2380 // Assume the most general case where the switch is lowered into
2381 // either a jump table, bit test, or a balanced binary tree consisting of
2382 // case clusters without merging adjacent clusters with the same
2383 // destination. We do not consider the switches that are lowered with a mix
2384 // of jump table/bit test/binary search tree. The cost of the switch is
2385 // proportional to the size of the tree or the size of jump table range.
2386 //
2387 // NB: We convert large switches which are just used to initialize large phi
2388 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2389 // inlining those. It will prevent inlining in cases where the optimization
2390 // does not (yet) fire.
2391
2392 unsigned JumpTableSize = 0;
2393 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2394 unsigned NumCaseCluster =
2395 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2396
2397 onFinalizeSwitch(JumpTableSize, NumCaseCluster);
2398 return false;
2399}
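
// Example: a switch whose condition simplified to a constant is free;
// otherwise its cost grows roughly with the jump-table range or, when no
// table is profitable, with the size of a balanced binary search tree over
// the case clusters.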
2400
2401bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2402 // We never want to inline functions that contain an indirectbr. This is
2403 // incorrect because all the blockaddresses (in static global initializers
2404 // for example) would be referring to the original function, and this
2405 // indirect jump would jump from the inlined copy of the function into the
2406 // original function which is extremely undefined behavior.
2407 // FIXME: This logic isn't really right; we can safely inline functions with
2408 // indirectbr's as long as no other function or global references the
2409 // blockaddress of a block within the current function.
2410 HasIndirectBr = true;
2411 return false;
2412}
2413
2414bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2415 // FIXME: It's not clear that a single instruction is an accurate model for
2416 // the inline cost of a resume instruction.
2417 return false;
2418}
2419
2420bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2421 // FIXME: It's not clear that a single instruction is an accurate model for
2422 // the inline cost of a cleanupret instruction.
2423 return false;
2424}
2425
2426bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2427 // FIXME: It's not clear that a single instruction is an accurate model for
2428 // the inline cost of a catchret instruction.
2429 return false;
2430}
2431
2432bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2433 // FIXME: It might be reasonable to discount the cost of instructions leading
2434 // to unreachable as they have the lowest possible impact on both runtime and
2435 // code size.
2436 return true; // No actual code is needed for unreachable.
2437}
2438
2439bool CallAnalyzer::visitInstruction(Instruction &I) {
2440 // Some instructions are free. All of the free intrinsics can also be
2441 // handled by SROA, etc.
2442 if (TargetTransformInfo::TCC_Free ==
2443 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency))
2444 return true;
2445
2446 // We found something we don't understand or can't handle. Mark any SROA-able
2447 // values in the operand list as no longer viable.
2448 for (const Use &Op : I.operands())
2449 disableSROA(Op);
2450
2451 return false;
2452}
2453
2454/// Analyze a basic block for its contribution to the inline cost.
2455///
2456/// This method walks the analyzer over every instruction in the given basic
2457/// block and accounts for their cost during inlining at this callsite. It
2458/// aborts early if the threshold has been exceeded or an impossible to inline
2459/// construct has been detected. It returns false if inlining is no longer
2460/// viable, and true if inlining remains viable.
2461 InlineResult
2462 CallAnalyzer::analyzeBlock(BasicBlock *BB,
2463 SmallPtrSetImpl<const Value *> &EphValues) {
2464 for (Instruction &I : *BB) {
2465 // FIXME: Currently, the number of instructions in a function, regardless of
2466 // our ability to simplify them during inlining to constants or dead code,
2467 // is actually used by the vector bonus heuristic. As long as that's true,
2468 // we have to special case debug intrinsics here to prevent differences in
2469 // inlining due to debug symbols. Eventually, the number of unsimplified
2470 // instructions shouldn't factor into the cost computation, but until then,
2471 // hack around it here.
2472 // Similarly, skip pseudo-probes.
2473 if (I.isDebugOrPseudoInst())
2474 continue;
2475
2476 // Skip ephemeral values.
2477 if (EphValues.count(&I))
2478 continue;
2479
2480 ++NumInstructions;
2481 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2482 ++NumVectorInstructions;
2483
2484 // If the instruction simplified to a constant, there is no cost to this
2485 // instruction. Visit the instructions using our InstVisitor to account for
2486 // all of the per-instruction logic. The visit tree returns true if we
2487 // consumed the instruction in any way, and false if the instruction's base
2488 // cost should count against inlining.
2489 onInstructionAnalysisStart(&I);
2490
2491 if (Base::visit(&I))
2492 ++NumInstructionsSimplified;
2493 else
2494 onMissedSimplification();
2495
2496 onInstructionAnalysisFinish(&I);
2497 using namespace ore;
2498 // If visiting this instruction detected an uninlinable pattern, abort.
2499 InlineResult IR = InlineResult::success();
2500 if (IsRecursiveCall && !AllowRecursiveCall)
2501 IR = InlineResult::failure("recursive");
2502 else if (ExposesReturnsTwice)
2503 IR = InlineResult::failure("exposes returns twice");
2504 else if (HasDynamicAlloca)
2505 IR = InlineResult::failure("dynamic alloca");
2506 else if (HasIndirectBr)
2507 IR = InlineResult::failure("indirect branch");
2508 else if (HasUninlineableIntrinsic)
2509 IR = InlineResult::failure("uninlinable intrinsic");
2510 else if (InitsVargArgs)
2511 IR = InlineResult::failure("varargs");
2512 if (!IR.isSuccess()) {
2513 if (ORE)
2514 ORE->emit([&]() {
2515 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2516 &CandidateCall)
2517 << NV("Callee", &F) << " has uninlinable pattern ("
2518 << NV("InlineResult", IR.getFailureReason())
2519 << ") and cost is not fully computed";
2520 });
2521 return IR;
2522 }
2523
2524 // If the caller is a recursive function then we don't want to inline
2525 // functions which allocate a lot of stack space because it would increase
2526 // the caller stack usage dramatically.
2527 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2528 auto IR =
2529 InlineResult::failure("recursive and allocates too much stack space");
2530 if (ORE)
2531 ORE->emit([&]() {
2532 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2533 &CandidateCall)
2534 << NV("Callee", &F) << " is "
2535 << NV("InlineResult", IR.getFailureReason())
2536 << ". Cost is not fully computed";
2537 });
2538 return IR;
2539 }
2540
2541 if (shouldStop())
2542 return InlineResult::failure(
2543 "Call site analysis is not favorable to inlining.");
2544 }
2545
2546 return InlineResult::success();
2547}
2548
2549/// Compute the base pointer and cumulative constant offsets for V.
2550///
2551/// This strips all constant offsets off of V, leaving it the base pointer, and
2552/// accumulates the total constant offset applied in the returned constant. It
2553/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2554/// no constant offsets applied.
2555ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2556 if (!V->getType()->isPointerTy())
2557 return nullptr;
2558
2559 unsigned AS = V->getType()->getPointerAddressSpace();
2560 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2561 APInt Offset = APInt::getZero(IntPtrWidth);
2562
2563 // Even though we don't look through PHI nodes, we could be called on an
2564 // instruction in an unreachable block, which may be on a cycle.
2565 SmallPtrSet<Value *, 4> Visited;
2566 Visited.insert(V);
2567 do {
2568 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2569 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2570 return nullptr;
2571 V = GEP->getPointerOperand();
2572 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
2573 V = cast<Operator>(V)->getOperand(0);
2574 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2575 if (GA->isInterposable())
2576 break;
2577 V = GA->getAliasee();
2578 } else {
2579 break;
2580 }
2581 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2582 } while (Visited.insert(V).second);
2583
2584 Type *IdxPtrTy = DL.getIndexType(V->getType());
2585 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2586}
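
// Example (illustrative IR): given
//
//   %p = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 3
//
// this strips the GEP, leaves V pointing at %a, and returns the accumulated
// byte offset 12 as an index-typed constant.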
2587
2588/// Find dead blocks due to deleted CFG edges during inlining.
2589///
2590/// If we know the successor of the current block, \p CurrBB, has to be \p
2591/// NextBB, the other successors of \p CurrBB are dead if these successors have
2592/// no live incoming CFG edges. If one block is found to be dead, we can
2593/// continue growing the dead block list by checking the successors of the dead
2594/// blocks to see if all their incoming edges are dead or not.
2595void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2596 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2597 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2598 // known successor which is not the one under examination.
2599 return (DeadBlocks.count(Pred) ||
2600 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2601 };
2602
2603 auto IsNewlyDead = [&](BasicBlock *BB) {
2604 // If all the edges to a block are dead, the block is also dead.
2605 return (!DeadBlocks.count(BB) &&
2606 llvm::all_of(predecessors(BB),
2607 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2608 };
2609
2610 for (BasicBlock *Succ : successors(CurrBB)) {
2611 if (Succ == NextBB || !IsNewlyDead(Succ))
2612 continue;
2613 SmallVector<BasicBlock *, 4> NewDead;
2614 NewDead.push_back(Succ);
2615 while (!NewDead.empty()) {
2616 BasicBlock *Dead = NewDead.pop_back_val();
2617 if (DeadBlocks.insert(Dead).second)
2618 // Continue growing the dead block lists.
2619 for (BasicBlock *S : successors(Dead))
2620 if (IsNewlyDead(S))
2621 NewDead.push_back(S);
2622 }
2623 }
2624}
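
// Example: if the branch terminating %entry is known to take %then, the edge
// %entry -> %else is dead; when %else has no other live incoming edge, it
// and every block reachable only through it land in DeadBlocks and are never
// costed.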
2625
2626/// Analyze a call site for potential inlining.
2627///
2628/// Returns true if inlining this call is viable, and false if it is not
2629/// viable. It computes the cost and adjusts the threshold based on numerous
2630/// factors and heuristics. If this method returns false but the computed cost
2631/// is below the computed threshold, then inlining was forcibly disabled by
2632/// some artifact of the routine.
2633InlineResult CallAnalyzer::analyze() {
2634 ++NumCallsAnalyzed;
2635
2636 auto Result = onAnalysisStart();
2637 if (!Result.isSuccess())
2638 return Result;
2639
2640 if (F.empty())
2641 return InlineResult::success();
2642
2643 Function *Caller = CandidateCall.getFunction();
2644 // Check if the caller function is recursive itself.
2645 for (User *U : Caller->users()) {
2646 CallBase *Call = dyn_cast<CallBase>(U);
2647 if (Call && Call->getFunction() == Caller) {
2648 IsCallerRecursive = true;
2649 break;
2650 }
2651 }
2652
2653 // Populate our simplified values by mapping from function arguments to call
2654 // arguments with known important simplifications.
2655 auto CAI = CandidateCall.arg_begin();
2656 for (Argument &FAI : F.args()) {
2657 assert(CAI != CandidateCall.arg_end());
2658 if (Constant *C = dyn_cast<Constant>(CAI))
2659 SimplifiedValues[&FAI] = C;
2660
2661 Value *PtrArg = *CAI;
2662 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2663 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2664
2665 // We can SROA any pointer arguments derived from alloca instructions.
2666 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2667 SROAArgValues[&FAI] = SROAArg;
2668 onInitializeSROAArg(SROAArg);
2669 EnabledSROAAllocas.insert(SROAArg);
2670 }
2671 }
2672 ++CAI;
2673 }
2674 NumConstantArgs = SimplifiedValues.size();
2675 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2676 NumAllocaArgs = SROAArgValues.size();
2677
2678 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2679 // the ephemeral values multiple times (and they're completely determined by
2680 // the callee, so this is purely duplicate work).
2681 SmallPtrSet<const Value *, 32> EphValues;
2682 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2683
2684 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2685 // adding basic blocks of the callee which can be proven to be dead for this
2686 // particular call site in order to get more accurate cost estimates. This
2687 // requires a somewhat heavyweight iteration pattern: we need to walk the
2688 // basic blocks in a breadth-first order as we insert live successors. To
2689 // accomplish this, prioritizing for small iterations because we exit after
2690 // crossing our threshold, we use a small-size optimized SetVector.
2691 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2692 BBSetVector BBWorklist;
2693 BBWorklist.insert(&F.getEntryBlock());
2694
2695 // Note that we *must not* cache the size, this loop grows the worklist.
2696 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2697 if (shouldStop())
2698 break;
2699
2700 BasicBlock *BB = BBWorklist[Idx];
2701 if (BB->empty())
2702 continue;
2703
2704 onBlockStart(BB);
2705
2706 // Disallow inlining a blockaddress with uses other than strictly callbr.
2707 // A blockaddress only has defined behavior for an indirect branch in the
2708 // same function, and we do not currently support inlining indirect
2709 // branches. But, the inliner may not see an indirect branch that ends up
2710 // being dead code at a particular call site. If the blockaddress escapes
2711 // the function, e.g., via a global variable, inlining may lead to an
2712 // invalid cross-function reference.
2713 // FIXME: pr/39560: continue relaxing this overt restriction.
2714 if (BB->hasAddressTaken())
2715 for (User *U : BlockAddress::get(&*BB)->users())
2716 if (!isa<CallBrInst>(*U))
2717 return InlineResult::failure("blockaddress used outside of callbr");
2718
2719 // Analyze the cost of this block. If we blow through the threshold, this
2720 // returns false, and we can bail out.
2721 InlineResult IR = analyzeBlock(BB, EphValues);
2722 if (!IR.isSuccess())
2723 return IR;
2724
2725 Instruction *TI = BB->getTerminator();
2726
2727 // Add in the live successors by first checking whether we have terminator
2728 // that may be simplified based on the values simplified by this call.
2729 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2730 if (BI->isConditional()) {
2731 Value *Cond = BI->getCondition();
2732 if (ConstantInt *SimpleCond =
2733 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2734 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2735 BBWorklist.insert(NextBB);
2736 KnownSuccessors[BB] = NextBB;
2737 findDeadBlocks(BB, NextBB);
2738 continue;
2739 }
2740 }
2741 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2742 Value *Cond = SI->getCondition();
2743 if (ConstantInt *SimpleCond =
2744 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2745 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2746 BBWorklist.insert(NextBB);
2747 KnownSuccessors[BB] = NextBB;
2748 findDeadBlocks(BB, NextBB);
2749 continue;
2750 }
2751 }
2752
2753 // If we're unable to select a particular successor, just count all of
2754 // them.
2755 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2756 ++TIdx)
2757 BBWorklist.insert(TI->getSuccessor(TIdx));
2758
2759 onBlockAnalyzed(BB);
2760 }
2761
2762 // If this is a noduplicate call, we can still inline as long as
2763 // inlining this would cause the removal of the caller (so the instruction
2764 // is not actually duplicated, just moved).
2765 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2766 return InlineResult::failure("noduplicate");
2767
2768 // If the callee's stack size exceeds the user-specified threshold,
2769 // do not let it be inlined.
2770 // The command line option overrides a limit set in the function attributes.
2771 size_t FinalStackSizeThreshold = StackSizeThreshold;
2772 if (!StackSizeThreshold.getNumOccurrences())
2773 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2774 Call, InlineConstants::MaxInlineStackSizeAttributeName))
2775 FinalStackSizeThreshold = *AttrMaxStackSize;
2776 if (AllocatedSize > FinalStackSizeThreshold)
2777 return InlineResult::failure("stacksize");
2778
2779 return finalizeAnalysis();
2780}
2781
2782void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2783#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2784 if (PrintInstructionComments)
2785 F.print(OS, &Writer);
2786 DEBUG_PRINT_STAT(NumConstantArgs);
2787 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2788 DEBUG_PRINT_STAT(NumAllocaArgs);
2789 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2790 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2791 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2792 DEBUG_PRINT_STAT(NumInstructions);
2793 DEBUG_PRINT_STAT(SROACostSavings);
2794 DEBUG_PRINT_STAT(SROACostSavingsLost);
2795 DEBUG_PRINT_STAT(LoadEliminationCost);
2796 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2797 DEBUG_PRINT_STAT(Cost);
2798 DEBUG_PRINT_STAT(Threshold);
2799#undef DEBUG_PRINT_STAT
2800}
2801
2802#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2803/// Dump stats about this call's analysis.
2804LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2805#endif
2806
2807/// Test that there are no attribute conflicts between Caller and Callee
2808/// that prevent inlining.
2810 Function *Caller, Function *Callee,
2811 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2812 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2813 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2814 // object, and always returns the same object (which is overwritten on each
2815 // GetTLI call). Therefore we copy the first result.
2816 auto CalleeTLI = GetTLI(*Callee);
2817 return GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2818 InlineCallerSupersetNoBuiltin) &&
2819 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2820}
2821
2822 int llvm::getCallsiteCost(const CallBase &Call, const DataLayout &DL) {
2823 int64_t Cost = 0;
2824 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2825 if (Call.isByValArgument(I)) {
2826 // We approximate the number of loads and stores needed by dividing the
2827 // size of the byval type by the target's pointer size.
2828 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2829 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2830 unsigned AS = PTy->getAddressSpace();
2831 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2832 // Ceiling division.
2833 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2834
2835 // If it generates more than 8 stores it is likely to be expanded as an
2836 // inline memcpy so we take that as an upper bound. Otherwise we assume
2837 // one load and one store per word copied.
2838 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2839 // here instead of a magic number of 8, but it's not available via
2840 // DataLayout.
2841 NumStores = std::min(NumStores, 8U);
2842
2843 Cost += 2 * NumStores * InstrCost;
2844 } else {
2845 // For non-byval arguments subtract off one instruction per call
2846 // argument.
2847 Cost += InstrCost;
2848 }
2849 }
2850 // The call instruction also disappears after inlining.
2851 Cost += InstrCost;
2852 Cost += CallPenalty;
2853 return std::min<int64_t>(Cost, INT_MAX);
2854}
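
// Worked example: a 64-byte byval argument on a target with 64-bit pointers
// needs ceil(512 / 64) = 8 word copies, costed as 2 * 8 * InstrCost (a load
// and a store per word); larger types are capped at the same 8 stores on the
// assumption they expand to an inline memcpy.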
2855
2856 InlineCost llvm::getInlineCost(
2857 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2858 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2859 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2860 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2861 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2862 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2863 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2864}
2865
2866 std::optional<int> llvm::getInliningCostEstimate(
2867 CallBase &Call, TargetTransformInfo &CalleeTTI,
2868 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2869 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2870 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2871 const InlineParams Params = {/* DefaultThreshold*/ 0,
2872 /*HintThreshold*/ {},
2873 /*ColdThreshold*/ {},
2874 /*OptSizeThreshold*/ {},
2875 /*OptMinSizeThreshold*/ {},
2876 /*HotCallSiteThreshold*/ {},
2877 /*LocallyHotCallSiteThreshold*/ {},
2878 /*ColdCallSiteThreshold*/ {},
2879 /*ComputeFullInlineCost*/ true,
2880 /*EnableDeferral*/ true};
2881
2882 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2883 GetAssumptionCache, GetBFI, PSI, ORE, true,
2884 /*IgnoreThreshold*/ true);
2885 auto R = CA.analyze();
2886 if (!R.isSuccess())
2887 return std::nullopt;
2888 return CA.getCost();
2889}
2890
2891std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2892 CallBase &Call, TargetTransformInfo &CalleeTTI,
2893 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2894 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2895 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2896 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2897 ORE, *Call.getCalledFunction(), Call);
2898 auto R = CFA.analyze();
2899 if (!R.isSuccess())
2900 return std::nullopt;
2901 return CFA.features();
2902}
2903
2904 std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2905 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2906 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2907
2908 // Cannot inline indirect calls.
2909 if (!Callee)
2910 return InlineResult::failure("indirect call");
2911
2912 // When a callee coroutine function is inlined into a caller coroutine
2913 // function before the coro-split pass, the coro-early pass cannot handle
2914 // it well, so we do not inline a coroutine function that has not yet been
2915 // split.
2916 if (Callee->isPresplitCoroutine())
2917 return InlineResult::failure("unsplited coroutine call");
2918
2919 // Never inline calls with byval arguments that do not have the alloca
2920 // address space. Since byval arguments can be replaced with a copy to an
2921 // alloca, the inlined code would need to be adjusted to handle that the
2922 // argument is in the alloca address space (so it is a little bit complicated
2923 // to solve).
2924 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2925 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2926 if (Call.isByValArgument(I)) {
2927 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2928 if (PTy->getAddressSpace() != AllocaAS)
2929 return InlineResult::failure("byval arguments without alloca"
2930 " address space");
2931 }
2932
2933 // Never inline functions with conflicting target attributes.
2934 Function *Caller = Call.getCaller();
2935 if (!IgnoreTTIInlineCompatible &&
2936 !CalleeTTI.areInlineCompatible(Caller, Callee))
2937 return InlineResult::failure("conflicting target attributes");
2938
2939 // Calls to functions with always-inline attributes should be inlined
2940 // whenever possible.
2941 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2942 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
2943 return InlineResult::failure("noinline call site attribute");
2944
2945 auto IsViable = isInlineViable(*Callee);
2946 if (IsViable.isSuccess())
2947 return InlineResult::success();
2948 return InlineResult::failure(IsViable.getFailureReason());
2949 }
2950
2951 // Never inline functions with conflicting attributes (unless callee has
2952 // always-inline attribute).
2953 // FIXME: functionsHaveCompatibleAttributes below checks for compatibilities
2954 // of different kinds of function attributes -- sanitizer-related ones,
2955 // checkDenormMode, no-builtin-memcpy, etc. It's unclear if we really want
2956 // the always-inline attribute to take precedence over these different types
2957 // of function attributes.
2958 if (!functionsHaveCompatibleAttributes(Caller, Callee, GetTLI))
2959 return InlineResult::failure("conflicting attributes");
2960
2961 // Don't inline this call if the caller has the optnone attribute.
2962 if (Caller->hasOptNone())
2963 return InlineResult::failure("optnone attribute");
2964
2965 // Don't inline a function that treats null pointer as valid into a caller
2966 // that does not have this attribute.
2967 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2968 return InlineResult::failure("nullptr definitions incompatible");
2969
2970 // Don't inline functions which can be interposed at link-time.
2971 if (Callee->isInterposable())
2972 return InlineResult::failure("interposable");
2973
2974 // Don't inline functions marked noinline.
2975 if (Callee->hasFnAttribute(Attribute::NoInline))
2976 return InlineResult::failure("noinline function attribute");
2977
2978 // Don't inline call sites marked noinline.
2979 if (Call.isNoInline())
2980 return InlineResult::failure("noinline call site attribute");
2981
2982 return std::nullopt;
2983}
2984
2985 InlineCost llvm::getInlineCost(
2986 CallBase &Call, Function *Callee, const InlineParams &Params,
2987 TargetTransformInfo &CalleeTTI,
2988 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2989 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2990 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2991 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2992
2993 auto UserDecision =
2994 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
2995
2996 if (UserDecision) {
2997 if (UserDecision->isSuccess())
2998 return llvm::InlineCost::getAlways("always inline attribute");
2999 return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3000 }
3001
3002 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3003 << "... (caller:" << Call.getCaller()->getName()
3004 << ")\n");
3005
3006 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3007 GetAssumptionCache, GetBFI, PSI, ORE);
3008 InlineResult ShouldInline = CA.analyze();
3009
3010 LLVM_DEBUG(CA.dump());
3011
3012 // Always make cost benefit based decision explicit.
3013 // We use always/never here since threshold is not meaningful,
3014 // as it's not what drives cost-benefit analysis.
3015 if (CA.wasDecidedByCostBenefit()) {
3016 if (ShouldInline.isSuccess())
3017 return InlineCost::getAlways("benefit over cost",
3018 CA.getCostBenefitPair());
3019 else
3020 return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3021 }
3022
3023 if (CA.wasDecidedByCostThreshold())
3024 return InlineCost::get(CA.getCost(), CA.getThreshold(),
3025 CA.getStaticBonusApplied());
3026
3027 // No details on how the decision was made, simply return always or never.
3028 return ShouldInline.isSuccess()
3029 ? InlineCost::getAlways("empty function")
3030 : InlineCost::getNever(ShouldInline.getFailureReason());
3031}
3032
3033 InlineResult llvm::isInlineViable(Function &F) {
3034 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining of functions that call @llvm.localescape. Doing
          // this correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}
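
// As an illustration, a function like
//
//   define void @f(ptr %dest) {
//     indirectbr ptr %dest, [label %bb]
//   bb:
//     ret void
//   }
//
// is reported as non-viable ("contains indirect branches") regardless of any
// cost threshold.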

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that the optimization level is O3 (in the getInlineParams variant
  // that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}
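
// The knobs above can also be exercised from the command line; for example
// (a sketch, using the new pass manager spelling of the inliner):
//
//   opt -passes='cgscc(inline)' -inline-threshold=500 -S in.ll
//
// overrides Params.DefaultThreshold for every call site in the module.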

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
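
// Worked example, using the InlineConstants values from InlineCost.h at the
// time of writing:
//   -O3 (OptLevel=3)      -> OptAggressiveThreshold (275)
//   -Os (SizeOptLevel=1)  -> OptSizeThreshold       (50)
//   -Oz (SizeOptLevel=2)  -> OptMinSizeThreshold    (5)
//   otherwise             -> DefaultThreshold       (225 unless overridden)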

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has an
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
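
// For instance, getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0) yields
// params with DefaultThreshold = OptAggressiveThreshold and a populated
// LocallyHotCallSiteThreshold, while getInlineParams(2, 0) leaves
// LocallyHotCallSiteThreshold unset unless the flag was passed explicitly.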

PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
  // In the current implementation, the type of InlineParams doesn't matter as
  // the pass serves only for verification of the inliner's decisions.
  // We could add a flag to select the InlineParams for this run; right now,
  // the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << "      Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
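
// A usage sketch for this printer pass (pass name per LLVM's pass registry;
// verify the spelling against PassRegistry.def for your tree):
//
//   opt -passes='print<inline-cost>' -disable-output in.ll
//
// prints the analyzer's per-instruction cost annotations for every direct
// call to a defined function.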