1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
18#include "llvm/ADT/Statistic.h"
31#include "llvm/Config/llvm-config.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/Dominators.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/InstVisitor.h"
40#include "llvm/IR/Operator.h"
43#include "llvm/Support/Debug.h"
46#include <climits>
47#include <limits>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "inline-cost"
53
54STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
56static cl::opt<int>
57 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
58 cl::desc("Default amount of inlining to perform"));
59
60// We introduce this option since there is a minor compile-time win by avoiding
61// addition of TTI attributes (target-features in particular) to inline
62// candidates when they are guaranteed to be the same as top level methods in
63// some use cases. If we avoid adding the attribute, we need an option to avoid
64// checking these attributes.
66 "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68 "during inline cost calculation"));
69
71 "print-instruction-comments", cl::Hidden, cl::init(false),
72 cl::desc("Prints comments for instruction based on inline cost analysis"));
73
75 "inline-threshold", cl::Hidden, cl::init(225),
76 cl::desc("Control the amount of inlining to perform (default = 225)"));
77
79 "inlinehint-threshold", cl::Hidden, cl::init(325),
80 cl::desc("Threshold for inlining functions with inline hint"));
81
82static cl::opt<int>
83 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
84 cl::init(45),
85 cl::desc("Threshold for inlining cold callsites"));
86
88 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89 cl::desc("Enable the cost-benefit analysis for the inliner"));
90
91// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92// specified explicitly in command line options. This option is exposed
93// for tuning and testing.
95 "inline-savings-multiplier", cl::Hidden, cl::init(8),
96 cl::desc("Multiplier to multiply cycle savings by during inlining"));
97
98// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99// specified explicitly in command line options. This option is exposed
100// for tuning and testing.
102 "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103 cl::desc("A multiplier on top of cycle savings to decide whether the "
104 "savings won't justify the cost"));
105
106static cl::opt<int>
107 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
108 cl::desc("The maximum size of a callee that get's "
109 "inlined without sufficient cycle savings"));
110
111// We introduce this threshold to help performance of instrumentation based
112// PGO before we actually hook up inliner with analysis passes such as BPI and
113// BFI.
115 "inlinecold-threshold", cl::Hidden, cl::init(45),
116 cl::desc("Threshold for inlining functions with cold attribute"));
117
118static cl::opt<int>
119 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
120 cl::desc("Threshold for hot callsites "));
121
123 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
124 cl::desc("Threshold for locally hot callsites "));
125
127 "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
128 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
129 "entry frequency, for a callsite to be cold in the absence of "
130 "profile information."));
131
133 "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
134 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
135 "entry frequency, for a callsite to be hot in the absence of "
136 "profile information."));
137
138static cl::opt<int>
139 InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
140 cl::desc("Cost of a single instruction when inlining"));
141
142static cl::opt<int>
143 MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
144 cl::desc("Cost of load/store instruction when inlining"));
145
147 "inline-call-penalty", cl::Hidden, cl::init(25),
148 cl::desc("Call penalty that is applied per callsite when inlining"));
149
150static cl::opt<size_t>
151 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
152 cl::init(std::numeric_limits<size_t>::max()),
153 cl::desc("Do not inline functions with a stack size "
154 "that exceeds the specified limit"));
155
157 "recursive-inline-max-stacksize", cl::Hidden,
159 cl::desc("Do not inline recursive functions with a stack "
160 "size that exceeds the specified limit"));
161
163 "inline-cost-full", cl::Hidden,
164 cl::desc("Compute the full inline cost of a call site even when the cost "
165 "exceeds the threshold."));
166
168 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
169 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
170 "attributes."));
171
173 "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
174 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175
176namespace llvm {
177std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
178 if (Attr.isValid()) {
179 int AttrValue = 0;
180 if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181 return AttrValue;
182 }
183 return std::nullopt;
184}
185
186std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
187 return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188}
189
190std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
191 return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192}
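// For example, a call site carrying the string attribute
// "function-inline-cost"="42" yields 42 from these helpers; a missing
// attribute or a non-integer value yields std::nullopt.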
193
194namespace InlineConstants {
195int getInstrCost() { return InstrCost; }
196
197} // namespace InlineConstants
198
199} // namespace llvm
200
201namespace {
202class InlineCostCallAnalyzer;
203
204// This struct is used to store information about inline cost of a
205// particular instruction
206struct InstructionCostDetail {
207 int CostBefore = 0;
208 int CostAfter = 0;
209 int ThresholdBefore = 0;
210 int ThresholdAfter = 0;
211
212 int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
213
214 int getCostDelta() const { return CostAfter - CostBefore; }
215
216 bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
217};
218
219class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
220private:
221 InlineCostCallAnalyzer *const ICCA;
222
223public:
224 InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
225 void emitInstructionAnnot(const Instruction *I,
226 formatted_raw_ostream &OS) override;
227};
228
229/// Carry out call site analysis, in order to evaluate inlinability.
230/// NOTE: the type is currently used as implementation detail of functions such
231/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
232/// expectation is that they come from the outer scope, from the wrapper
233/// functions. If we want to support constructing CallAnalyzer objects where
234/// lambdas are provided inline at construction, or where the object needs to
235/// otherwise survive past the scope of the provided functions, we need to
236/// revisit the argument types.
237class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
238 typedef InstVisitor<CallAnalyzer, bool> Base;
239 friend class InstVisitor<CallAnalyzer, bool>;
240
241protected:
242 virtual ~CallAnalyzer() = default;
243 /// The TargetTransformInfo available for this compilation.
245 const TargetTransformInfo &TTI;
246 /// Getter for the cache of @llvm.assume intrinsics.
247 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
248
249 /// Getter for BlockFrequencyInfo
251 function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
252 /// Profile summary information.
254 ProfileSummaryInfo *PSI;
255 /// The called function.
256 Function &F;
257
258 // Cache the DataLayout since we use it a lot.
259 const DataLayout &DL;
260
261 /// The OptimizationRemarkEmitter available for this compilation.
263 OptimizationRemarkEmitter *ORE;
264 /// The candidate callsite being analyzed. Please do not use this to do
265 /// analysis in the caller function; we want the inline cost query to be
266 /// easily cacheable. Instead, use the cover function paramHasAttr.
267 CallBase &CandidateCall;
268
269 /// Extension points for handling callsite features.
270 // Called before a basic block was analyzed.
271 virtual void onBlockStart(const BasicBlock *BB) {}
272
273 /// Called after a basic block was analyzed.
274 virtual void onBlockAnalyzed(const BasicBlock *BB) {}
275
276 /// Called before an instruction was analyzed
277 virtual void onInstructionAnalysisStart(const Instruction *I) {}
278
279 /// Called after an instruction was analyzed
280 virtual void onInstructionAnalysisFinish(const Instruction *I) {}
281
282 /// Called at the end of the analysis of the callsite. Return the outcome of
283 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
284 /// the reason it can't.
285 virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
286 /// Called when we're about to start processing a basic block, and every time
287 /// we are done processing an instruction. Return true if there is no point in
288 /// continuing the analysis (e.g. we've determined already the call site is
289 /// too expensive to inline)
290 virtual bool shouldStop() { return false; }
291
292 /// Called before the analysis of the callee body starts (with callsite
293 /// contexts propagated). It checks callsite-specific information. Return a
294 /// reason analysis can't continue if that's the case, or 'true' if it may
295 /// continue.
296 virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
297 /// Called if the analysis engine decides SROA cannot be done for the given
298 /// alloca.
299 virtual void onDisableSROA(AllocaInst *Arg) {}
300
301 /// Called when the analysis engine determines load elimination won't happen.
302 virtual void onDisableLoadElimination() {}
303
304 /// Called when we visit a CallBase, before the analysis starts. Return false
305 /// to stop further processing of the instruction.
306 virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
307
308 /// Called to account for a call.
309 virtual void onCallPenalty() {}
310
311 /// Called to account for a load or store.
312 virtual void onMemAccess(){};
313
314 /// Called to account for the expectation the inlining would result in a load
315 /// elimination.
316 virtual void onLoadEliminationOpportunity() {}
317
318 /// Called to account for the cost of argument setup for the Call in the
319 /// callee's body (not the callsite currently under analysis).
320 virtual void onCallArgumentSetup(const CallBase &Call) {}
321
322 /// Called to account for a load relative intrinsic.
323 virtual void onLoadRelativeIntrinsic() {}
324
325 /// Called to account for a lowered call.
326 virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
327 }
328
329 /// Account for a jump table of given size. Return false to stop further
330 /// processing the switch instruction
331 virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
332
333 /// Account for a case cluster of given size. Return false to stop further
334 /// processing of the instruction.
335 virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
336
337 /// Called at the end of processing a switch instruction, with the given
338 /// number of case clusters.
339 virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
340 bool DefaultDestUndefined) {}
341
342 /// Called to account for any other instruction not specifically accounted
343 /// for.
344 virtual void onMissedSimplification() {}
345
346 /// Start accounting potential benefits due to SROA for the given alloca.
347 virtual void onInitializeSROAArg(AllocaInst *Arg) {}
348
349 /// Account SROA savings for the AllocaInst value.
350 virtual void onAggregateSROAUse(AllocaInst *V) {}
351
352 bool handleSROA(Value *V, bool DoNotDisable) {
353 // Check for SROA candidates in comparisons.
354 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
355 if (DoNotDisable) {
356 onAggregateSROAUse(SROAArg);
357 return true;
358 }
359 disableSROAForArg(SROAArg);
360 }
361 return false;
362 }
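// In other words, a use that keeps SROA viable credits additional savings via
// onAggregateSROAUse, while any other use permanently disables SROA for that
// alloca.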
363
364 bool IsCallerRecursive = false;
365 bool IsRecursiveCall = false;
366 bool ExposesReturnsTwice = false;
367 bool HasDynamicAlloca = false;
368 bool ContainsNoDuplicateCall = false;
369 bool HasReturn = false;
370 bool HasIndirectBr = false;
371 bool HasUninlineableIntrinsic = false;
372 bool InitsVargArgs = false;
373
374 /// Number of bytes allocated statically by the callee.
375 uint64_t AllocatedSize = 0;
376 unsigned NumInstructions = 0;
377 unsigned NumVectorInstructions = 0;
378
379 /// While we walk the potentially-inlined instructions, we build up and
380 /// maintain a mapping of simplified values specific to this callsite. The
381 /// idea is to propagate any special information we have about arguments to
382 /// this call through the inlinable section of the function, and account for
383 /// likely simplifications post-inlining. The most important aspect we track
384 /// is CFG altering simplifications -- when we prove a basic block dead, that
385 /// can cause dramatic shifts in the cost of inlining a function.
386 DenseMap<Value *, Constant *> SimplifiedValues;
387
388 /// Keep track of the values which map back (through function arguments) to
389 /// allocas on the caller stack which could be simplified through SROA.
391 DenseMap<Value *, AllocaInst *> SROAArgValues;
392 /// Keep track of Allocas for which we believe we may get SROA optimization.
393 DenseSet<AllocaInst *> EnabledSROAAllocas;
394
395 /// Keep track of values which map to a pointer base and constant offset.
397 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
398 /// Keep track of dead blocks due to the constant arguments.
400 DenseSet<BasicBlock *> DeadBlocks;
401 /// The mapping of the blocks to their known unique successors due to the
402 /// constant arguments.
404 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
405 /// Model the elimination of repeated loads that is expected to happen
406 /// whenever we simplify away the stores that would otherwise cause them to be
407 /// loads.
408 bool EnableLoadElimination = true;
409
410 /// Whether we allow inlining for recursive call.
411 bool AllowRecursiveCall = false;
412
413 SmallPtrSet<Value *, 16> LoadAddrSet;
414
415 AllocaInst *getSROAArgForValueOrNull(Value *V) const {
416 auto It = SROAArgValues.find(V);
417 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
418 return nullptr;
419 return It->second;
420 }
421
422 // Custom simplification helper routines.
423 bool isAllocaDerivedArg(Value *V);
424 void disableSROAForArg(AllocaInst *SROAArg);
425 void disableSROA(Value *V);
426 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
427 void disableLoadElimination();
428 bool isGEPFree(GetElementPtrInst &GEP);
429 bool canFoldInboundsGEP(GetElementPtrInst &I);
430 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
431 bool simplifyCallSite(Function *F, CallBase &Call);
432 bool simplifyInstruction(Instruction &I);
433 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
434 bool simplifyIntrinsicCallObjectSize(CallBase &CB);
435 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
436
437 /// Return true if the given argument to the function being considered for
438 /// inlining has the given attribute set either at the call site or the
439 /// function declaration. Primarily used to inspect call site specific
440 /// attributes since these can be more precise than the ones on the callee
441 /// itself.
442 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
443
444 /// Return true if the given value is known non null within the callee if
445 /// inlined through this particular callsite.
446 bool isKnownNonNullInCallee(Value *V);
447
448 /// Return true if size growth is allowed when inlining the callee at \p Call.
449 bool allowSizeGrowth(CallBase &Call);
450
451 // Custom analysis routines.
452 InlineResult analyzeBlock(BasicBlock *BB,
453 SmallPtrSetImpl<BasicBlock *> &VisitedBlocks);
454
455 // Disable several entry points to the visitor so we don't accidentally use
456 // them by declaring but not defining them here.
457 void visit(Module *);
458 void visit(Module &);
459 void visit(Function *);
460 void visit(Function &);
461 void visit(BasicBlock *);
462 void visit(BasicBlock &);
463
464 // Provide base case for our instruction visit.
466 bool visitInstruction(Instruction &I);
467 // Our visit overrides.
468 bool visitAlloca(AllocaInst &I);
469 bool visitPHI(PHINode &I);
470 bool visitGetElementPtr(GetElementPtrInst &I);
471 bool visitBitCast(BitCastInst &I);
472 bool visitPtrToInt(PtrToIntInst &I);
473 bool visitIntToPtr(IntToPtrInst &I);
474 bool visitCastInst(CastInst &I);
475 bool visitCmpInst(CmpInst &I);
476 bool visitSub(BinaryOperator &I);
477 bool visitBinaryOperator(BinaryOperator &I);
478 bool visitFNeg(UnaryOperator &I);
479 bool visitLoad(LoadInst &I);
480 bool visitStore(StoreInst &I);
481 bool visitExtractValue(ExtractValueInst &I);
482 bool visitInsertValue(InsertValueInst &I);
483 bool visitCallBase(CallBase &Call);
484 bool visitReturnInst(ReturnInst &RI);
485 bool visitBranchInst(BranchInst &BI);
486 bool visitSelectInst(SelectInst &SI);
487 bool visitSwitchInst(SwitchInst &SI);
488 bool visitIndirectBrInst(IndirectBrInst &IBI);
489 bool visitResumeInst(ResumeInst &RI);
490 bool visitCleanupReturnInst(CleanupReturnInst &RI);
491 bool visitCatchReturnInst(CatchReturnInst &RI);
492 bool visitUnreachableInst(UnreachableInst &I);
493
494public:
495 CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
496 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
497 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
498 ProfileSummaryInfo *PSI = nullptr,
499 OptimizationRemarkEmitter *ORE = nullptr)
500 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
501 PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
502 CandidateCall(Call) {}
503
504 InlineResult analyze();
505
506 std::optional<Constant *> getSimplifiedValue(Instruction *I) {
507 if (SimplifiedValues.contains(I))
508 return SimplifiedValues[I];
509 return std::nullopt;
510 }
511
512 // Keep a bunch of stats about the cost savings found so we can print them
513 // out when debugging.
514 unsigned NumConstantArgs = 0;
515 unsigned NumConstantOffsetPtrArgs = 0;
516 unsigned NumAllocaArgs = 0;
517 unsigned NumConstantPtrCmps = 0;
518 unsigned NumConstantPtrDiffs = 0;
519 unsigned NumInstructionsSimplified = 0;
520
521 void dump();
522};
523
524// Considering forming a binary search, we should find the number of nodes
525 // which is the same as the number of comparisons when lowered. For a given
526// number of clusters, n, we can define a recursive function, f(n), to find
527// the number of nodes in the tree. The recursion is :
528// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
529// and f(n) = n, when n <= 3.
530 // This will lead to a binary tree where each leaf is either f(2) or f(3)
531 // when n > 3. So, the number of comparisons from leaves should be n, while
532 // the number from non-leaf nodes should be:
533// 2^(log2(n) - 1) - 1
534// = 2^log2(n) * 2^-1 - 1
535// = n / 2 - 1.
536// Considering comparisons from leaf and non-leaf nodes, we can estimate the
537// number of comparisons in a simple closed form :
538// n + n / 2 - 1 = n * 3 / 2 - 1
539int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
540 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
541}
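// For example, with NumCaseCluster = 8 the recursion gives
// f(8) = 1 + 2 * f(4) = 1 + 2 * (1 + 2 * f(2)) = 11, matching the closed form
// 3 * 8 / 2 - 1 = 11.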
542
543/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
544/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
545class InlineCostCallAnalyzer final : public CallAnalyzer {
546 const bool ComputeFullInlineCost;
547 int LoadEliminationCost = 0;
548 /// Bonus to be applied when percentage of vector instructions in callee is
549 /// high (see more details in updateThreshold).
550 int VectorBonus = 0;
551 /// Bonus to be applied when the callee has only one reachable basic block.
552 int SingleBBBonus = 0;
553
554 /// Tunable parameters that control the analysis.
555 const InlineParams &Params;
556
557 // This DenseMap stores the delta change in cost and threshold after
558 // accounting for the given instruction. The map is filled only with the
559 // flag PrintInstructionComments on.
561 DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
562 /// Upper bound for the inlining cost. Bonuses are being applied to account
563 /// for speculative "expected profit" of the inlining decision.
564 int Threshold = 0;
565
566 /// The amount of StaticBonus applied.
567 int StaticBonusApplied = 0;
568
569 /// Attempt to evaluate indirect calls to boost their inline cost.
570 const bool BoostIndirectCalls;
571
572 /// Ignore the threshold when finalizing analysis.
573 const bool IgnoreThreshold;
574
575 // True if the cost-benefit-analysis-based inliner is enabled.
576 const bool CostBenefitAnalysisEnabled;
577
578 /// Inlining cost measured in abstract units, accounts for all the
579 /// instructions expected to be executed for a given function invocation.
580 /// Instructions that are statically proven to be dead based on call-site
581 /// arguments are not counted here.
582 int Cost = 0;
583
584 // The cumulative cost at the beginning of the basic block being analyzed. At
585 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
586 // the size of that basic block.
587 int CostAtBBStart = 0;
588
589 // The static size of live but cold basic blocks. This is "static" in the
590 // sense that it's not weighted by profile counts at all.
591 int ColdSize = 0;
592
593 // Whether inlining is decided by cost-threshold analysis.
594 bool DecidedByCostThreshold = false;
595
596 // Whether inlining is decided by cost-benefit analysis.
597 bool DecidedByCostBenefit = false;
598
599 // The cost-benefit pair computed by cost-benefit analysis.
600 std::optional<CostBenefitPair> CostBenefit;
601
602 bool SingleBB = true;
603
604 unsigned SROACostSavings = 0;
605 unsigned SROACostSavingsLost = 0;
606
607 /// The mapping of caller Alloca values to their accumulated cost savings. If
608 /// we have to disable SROA for one of the allocas, this tells us how much
609 /// cost must be added.
610 DenseMap<AllocaInst *, int> SROAArgCosts;
611
612 /// Return true if \p Call is a cold callsite.
613 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
614
615 /// Update Threshold based on callsite properties such as callee
616 /// attributes and callee hotness for PGO builds. The Callee is explicitly
617 /// passed to support analyzing indirect calls whose target is inferred by
618 /// analysis.
619 void updateThreshold(CallBase &Call, Function &Callee);
620 /// Return a higher threshold if \p Call is a hot callsite.
621 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
622 BlockFrequencyInfo *CallerBFI);
623
624 /// Handle a capped 'int' increment for Cost.
625 void addCost(int64_t Inc) {
626 Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
627 Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628 }
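// Both the increment and the running total saturate at INT_MIN/INT_MAX; the
// int64_t intermediate sum cannot overflow because both operands already fit
// in 32 bits.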
629
630 void onDisableSROA(AllocaInst *Arg) override {
631 auto CostIt = SROAArgCosts.find(Arg);
632 if (CostIt == SROAArgCosts.end())
633 return;
634 addCost(CostIt->second);
635 SROACostSavings -= CostIt->second;
636 SROACostSavingsLost += CostIt->second;
637 SROAArgCosts.erase(CostIt);
638 }
639
640 void onDisableLoadElimination() override {
641 addCost(LoadEliminationCost);
642 LoadEliminationCost = 0;
643 }
644
645 bool onCallBaseVisitStart(CallBase &Call) override {
646 if (std::optional<int> AttrCallThresholdBonus =
647 getStringFnAttrAsInt(Call, "call-threshold-bonus"))
648 Threshold += *AttrCallThresholdBonus;
649
650 if (std::optional<int> AttrCallCost =
651 getStringFnAttrAsInt(Call, "call-inline-cost")) {
652 addCost(*AttrCallCost);
653 // Prevent further processing of the call since we want to override its
654 // inline cost, not just add to it.
655 return false;
656 }
657 return true;
658 }
659
660 void onCallPenalty() override { addCost(CallPenalty); }
661
662 void onMemAccess() override { addCost(MemAccessCost); }
663
664 void onCallArgumentSetup(const CallBase &Call) override {
665 // Pay the price of the argument setup. We account for the average 1
666 // instruction per call argument setup here.
667 addCost(Call.arg_size() * InstrCost);
668 }
669 void onLoadRelativeIntrinsic() override {
670 // This is normally lowered to 4 LLVM instructions.
671 addCost(3 * InstrCost);
672 }
673 void onLoweredCall(Function *F, CallBase &Call,
674 bool IsIndirectCall) override {
675 // We account for the average 1 instruction per call argument setup here.
676 addCost(Call.arg_size() * InstrCost);
677
678 // If we have a constant that we are calling as a function, we can peer
679 // through it and see the function target. This happens not infrequently
680 // during devirtualization and so we want to give it a hefty bonus for
681 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
682 // Pretend to inline the function, with a custom threshold.
683 if (IsIndirectCall && BoostIndirectCalls) {
684 auto IndirectCallParams = Params;
685 IndirectCallParams.DefaultThreshold =
686 InlineConstants::IndirectCallThreshold;
687 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
688 /// to instantiate the derived class.
689 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
690 GetAssumptionCache, GetBFI, PSI, ORE, false);
691 if (CA.analyze().isSuccess()) {
692 // We were able to inline the indirect call! Subtract the cost from the
693 // threshold to get the bonus we want to apply, but don't go below zero.
694 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695 }
696 } else
697 // Otherwise simply add the cost for merely making the call.
698 addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
699 CallPenalty));
700 }
701
702 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
703 bool DefaultDestUndefined) override {
704 // If suitable for a jump table, consider the cost for the table size and
705 // branch to destination.
706 // Maximum valid cost increased in this function.
707 if (JumpTableSize) {
708 // Suppose a default branch includes one compare and one conditional
709 // branch if it's reachable.
710 if (!DefaultDestUndefined)
711 addCost(2 * InstrCost);
712 // Suppose a jump table requires one load and one jump instruction.
713 int64_t JTCost =
714 static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
715 addCost(JTCost);
716 return;
717 }
718
719 if (NumCaseCluster <= 3) {
720 // Suppose a comparison includes one compare and one conditional branch.
721 // We can reduce a set of instructions if the default branch is
722 // undefined.
723 addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
724 return;
725 }
726
727 int64_t ExpectedNumberOfCompare =
728 getExpectedNumberOfCompare(NumCaseCluster);
729 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
730
731 addCost(SwitchCost);
732 }
733 void onMissedSimplification() override { addCost(InstrCost); }
734
735 void onInitializeSROAArg(AllocaInst *Arg) override {
736 assert(Arg != nullptr &&
737 "Should not initialize SROA costs for null value.");
738 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
739 SROACostSavings += SROAArgCost;
740 SROAArgCosts[Arg] = SROAArgCost;
741 }
742
743 void onAggregateSROAUse(AllocaInst *SROAArg) override {
744 auto CostIt = SROAArgCosts.find(SROAArg);
745 assert(CostIt != SROAArgCosts.end() &&
746 "expected this argument to have a cost");
747 CostIt->second += InstrCost;
748 SROACostSavings += InstrCost;
749 }
750
751 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
752
753 void onBlockAnalyzed(const BasicBlock *BB) override {
754 if (CostBenefitAnalysisEnabled) {
755 // Keep track of the static size of live but cold basic blocks. For now,
756 // we define a cold basic block to be one that's never executed.
757 assert(GetBFI && "GetBFI must be available");
758 BlockFrequencyInfo *BFI = &(GetBFI(F));
759 assert(BFI && "BFI must be available");
760 auto ProfileCount = BFI->getBlockProfileCount(BB);
761 if (*ProfileCount == 0)
762 ColdSize += Cost - CostAtBBStart;
763 }
764
765 auto *TI = BB->getTerminator();
766 // If we had any successors at this point, then post-inlining is likely to
767 // have them as well. Note that we assume any basic blocks which existed
768 // due to branches or switches which folded above will also fold after
769 // inlining.
770 if (SingleBB && TI->getNumSuccessors() > 1) {
771 // Take off the bonus we applied to the threshold.
772 Threshold -= SingleBBBonus;
773 SingleBB = false;
774 }
775 }
776
777 void onInstructionAnalysisStart(const Instruction *I) override {
778 // This function is called to store the initial cost of inlining before
779 // the given instruction was assessed.
780 if (!PrintInstructionComments)
781 return;
782 InstructionCostDetailMap[I].CostBefore = Cost;
783 InstructionCostDetailMap[I].ThresholdBefore = Threshold;
784 }
785
786 void onInstructionAnalysisFinish(const Instruction *I) override {
787 // This function is called to find new values of cost and threshold after
788 // the instruction has been assessed.
789 if (!PrintInstructionComments)
790 return;
791 InstructionCostDetailMap[I].CostAfter = Cost;
792 InstructionCostDetailMap[I].ThresholdAfter = Threshold;
793 }
794
795 bool isCostBenefitAnalysisEnabled() {
796 if (!PSI || !PSI->hasProfileSummary())
797 return false;
798
799 if (!GetBFI)
800 return false;
801
802 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
803 // Honor the explicit request from the user.
804 if (!InlineEnableCostBenefitAnalysis)
805 return false;
806 } else {
807 // Otherwise, require instrumentation profile.
808 if (!PSI->hasInstrumentationProfile())
809 return false;
810 }
811
812 auto *Caller = CandidateCall.getParent()->getParent();
813 if (!Caller->getEntryCount())
814 return false;
815
816 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
817 if (!CallerBFI)
818 return false;
819
820 // For now, limit to hot call site.
821 if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
822 return false;
823
824 // Make sure we have a nonzero entry count.
825 auto EntryCount = F.getEntryCount();
826 if (!EntryCount || !EntryCount->getCount())
827 return false;
828
829 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
830 if (!CalleeBFI)
831 return false;
832
833 return true;
834 }
835
836 // A helper function to choose between command line override and default.
837 unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
838 if (InlineSavingsMultiplier.getNumOccurrences())
839 return InlineSavingsMultiplier;
840 return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
841 }
842
843 // A helper function to choose between command line override and default.
844 unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
845 if (InlineSavingsProfitableMultiplier.getNumOccurrences())
846 return InlineSavingsProfitableMultiplier;
847 return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
848 }
849
850 void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
851 if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
852 CandidateCall, "inline-cycle-savings-for-test")) {
853 CycleSavings = *AttrCycleSavings;
854 }
855
856 if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
857 CandidateCall, "inline-runtime-cost-for-test")) {
858 Size = *AttrRuntimeCost;
859 }
860 }
861
862 // Determine whether we should inline the given call site, taking into account
863 // both the size cost and the cycle savings. Return std::nullopt if we don't
864 // have sufficient profiling information to determine.
865 std::optional<bool> costBenefitAnalysis() {
866 if (!CostBenefitAnalysisEnabled)
867 return std::nullopt;
868
869 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
870 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
871 // falling back to the cost-based metric.
872 // TODO: Improve this hacky condition.
873 if (Threshold == 0)
874 return std::nullopt;
875
876 assert(GetBFI);
877 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
878 assert(CalleeBFI);
879
880 // The cycle savings expressed as the sum of InstrCost
881 // multiplied by the estimated dynamic count of each instruction we can
882 // avoid. Savings come from the call site cost, such as argument setup and
883 // the call instruction, as well as the instructions that are folded.
884 //
885 // We use 128-bit APInt here to avoid potential overflow. This variable
886 // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
887 // assumes that we can avoid or fold a billion instructions, each with a
888 // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
889 // period on a 4GHz machine.
890 APInt CycleSavings(128, 0);
891
892 for (auto &BB : F) {
893 APInt CurrentSavings(128, 0);
894 for (auto &I : BB) {
895 if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
896 // Count a conditional branch as savings if it becomes unconditional.
897 if (BI->isConditional() &&
898 isa_and_nonnull<ConstantInt>(
899 SimplifiedValues.lookup(BI->getCondition()))) {
900 CurrentSavings += InstrCost;
901 }
902 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
903 if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
904 CurrentSavings += InstrCost;
905 } else if (Value *V = dyn_cast<Value>(&I)) {
906 // Count an instruction as savings if we can fold it.
907 if (SimplifiedValues.count(V)) {
908 CurrentSavings += InstrCost;
909 }
910 }
911 }
912
913 auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
914 CurrentSavings *= *ProfileCount;
915 CycleSavings += CurrentSavings;
916 }
917
918 // Compute the cycle savings per call.
919 auto EntryProfileCount = F.getEntryCount();
920 assert(EntryProfileCount && EntryProfileCount->getCount());
921 auto EntryCount = EntryProfileCount->getCount();
922 CycleSavings += EntryCount / 2;
923 CycleSavings = CycleSavings.udiv(EntryCount);
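// Adding EntryCount / 2 before the unsigned division rounds the per-call
// savings to the nearest integer instead of truncating toward zero.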
924
925 // Compute the total savings for the call site.
926 auto *CallerBB = CandidateCall.getParent();
927 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
928 CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
929 CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
930
931 // Remove the cost of the cold basic blocks to model the runtime cost more
932 // accurately. Both machine block placement and function splitting could
933 // place cold blocks further from hot blocks.
934 int Size = Cost - ColdSize;
935
936 // Allow tiny callees to be inlined regardless of whether they meet the
937 // savings threshold.
938 Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
939
940 OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
941 CostBenefit.emplace(APInt(128, Size), CycleSavings);
942
943 // Let R be the ratio of CycleSavings to Size. We accept the inlining
944 // opportunity if R is really high and reject if R is really low. If R is
945 // somewhere in the middle, we fall back to the cost-based analysis.
946 //
947 // Specifically, let R = CycleSavings / Size, we accept the inlining
948 // opportunity if:
949 //
950 // PSI->getOrCompHotCountThreshold()
951 // R > -------------------------------------------------
952 // getInliningCostBenefitAnalysisSavingsMultiplier()
953 //
954 // and reject the inlining opportunity if:
955 //
956 // PSI->getOrCompHotCountThreshold()
957 // R <= ----------------------------------------------------
958 // getInliningCostBenefitAnalysisProfitableMultiplier()
959 //
960 // Otherwise, we fall back to the cost-based analysis.
961 //
962 // Implementation-wise, use multiplication (CycleSavings * Multiplier,
963 // HotCountThreshold * Size) rather than division to avoid precision loss.
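// For instance (illustrative numbers only): with a hot-count threshold of
// 1000, Size = 100, and a savings multiplier of 8, inlining is accepted once
// CycleSavings * 8 >= 1000 * 100, i.e. CycleSavings >= 12500.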
964 APInt Threshold(128, PSI->getOrCompHotCountThreshold());
965 Threshold *= Size;
966
967 APInt UpperBoundCycleSavings = CycleSavings;
968 UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
969 if (UpperBoundCycleSavings.uge(Threshold))
970 return true;
971
972 APInt LowerBoundCycleSavings = CycleSavings;
973 LowerBoundCycleSavings *=
974 getInliningCostBenefitAnalysisProfitableMultiplier();
975 if (LowerBoundCycleSavings.ult(Threshold))
976 return false;
977
978 // Otherwise, fall back to the cost-based analysis.
979 return std::nullopt;
980 }
981
982 InlineResult finalizeAnalysis() override {
983 // Loops generally act a lot like calls in that they act like barriers to
984 // movement, require a certain amount of setup, etc. So when optimising for
985 // size, we penalise any call sites that perform loops. We do this after all
986 // other costs here, so will likely only be dealing with relatively small
987 // functions (and hence DT and LI will hopefully be cheap).
988 auto *Caller = CandidateCall.getFunction();
989 if (Caller->hasMinSize()) {
990 DominatorTree DT(F);
991 LoopInfo LI(DT);
992 int NumLoops = 0;
993 for (Loop *L : LI) {
994 // Ignore loops that will not be executed
995 if (DeadBlocks.count(L->getHeader()))
996 continue;
997 NumLoops++;
998 }
999 addCost(NumLoops * InlineConstants::LoopPenalty);
1000 }
1001
1002 // We applied the maximum possible vector bonus at the beginning. Now,
1003 // subtract the excess bonus, if any, from the Threshold before
1004 // comparing against Cost.
1005 if (NumVectorInstructions <= NumInstructions / 10)
1006 Threshold -= VectorBonus;
1007 else if (NumVectorInstructions <= NumInstructions / 2)
1008 Threshold -= VectorBonus / 2;
1009
1010 if (std::optional<int> AttrCost =
1011 getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1012 Cost = *AttrCost;
1013
1014 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1015 CandidateCall,
1016 InlineConstants::FunctionInlineCostMultiplierAttributeName))
1017 Cost *= *AttrCostMult;
1018
1019 if (std::optional<int> AttrThreshold =
1020 getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1021 Threshold = *AttrThreshold;
1022
1023 if (auto Result = costBenefitAnalysis()) {
1024 DecidedByCostBenefit = true;
1025 if (*Result)
1026 return InlineResult::success();
1027 else
1028 return InlineResult::failure("Cost over threshold.");
1029 }
1030
1031 if (IgnoreThreshold)
1032 return InlineResult::success();
1033
1034 DecidedByCostThreshold = true;
1035 return Cost < std::max(1, Threshold)
1036 ? InlineResult::success()
1037 : InlineResult::failure("Cost over threshold.");
1038 }
1039
1040 bool shouldStop() override {
1041 if (IgnoreThreshold || ComputeFullInlineCost)
1042 return false;
1043 // Bail out the moment we cross the threshold. This means we'll under-count
1044 // the cost, but only when undercounting doesn't matter.
1045 if (Cost < Threshold)
1046 return false;
1047 DecidedByCostThreshold = true;
1048 return true;
1049 }
1050
1051 void onLoadEliminationOpportunity() override {
1052 LoadEliminationCost += InstrCost;
1053 }
1054
1055 InlineResult onAnalysisStart() override {
1056 // Perform some tweaks to the cost and threshold based on the direct
1057 // callsite information.
1058
1059 // We want to more aggressively inline vector-dense kernels, so up the
1060 // threshold, and we'll lower it if the % of vector instructions gets too
1061 // low. Note that these bonuses are somewhat arbitrary and evolved over
1062 // time by accident as much as because they are principled bonuses.
1063 //
1064 // FIXME: It would be nice to remove all such bonuses. At least it would be
1065 // nice to base the bonus values on something more scientific.
1066 assert(NumInstructions == 0);
1067 assert(NumVectorInstructions == 0);
1068
1069 // Update the threshold based on callsite properties
1070 updateThreshold(CandidateCall, F);
1071
1072 // While Threshold depends on commandline options that can take negative
1073 // values, we want to enforce the invariant that the computed threshold and
1074 // bonuses are non-negative.
1075 assert(Threshold >= 0);
1076 assert(SingleBBBonus >= 0);
1077 assert(VectorBonus >= 0);
1078
1079 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1080 // this Threshold any time, and cost cannot decrease, we can stop processing
1081 // the rest of the function body.
1082 Threshold += (SingleBBBonus + VectorBonus);
1083
1084 // Give out bonuses for the callsite, as the instructions setting them up
1085 // will be gone after inlining.
1086 addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1087
1088 // If this function uses the coldcc calling convention, prefer not to inline
1089 // it.
1090 if (F.getCallingConv() == CallingConv::Cold)
1091 Cost += InlineConstants::ColdccPenalty;
1092
1093 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1094
1095 // Check if we're done. This can happen due to bonuses and penalties.
1096 if (Cost >= Threshold && !ComputeFullInlineCost)
1097 return InlineResult::failure("high cost");
1098
1099 return InlineResult::success();
1100 }
1101
1102public:
1103 InlineCostCallAnalyzer(
1104 Function &Callee, CallBase &Call, const InlineParams &Params,
1105 const TargetTransformInfo &TTI,
1106 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1107 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1108 ProfileSummaryInfo *PSI = nullptr,
1109 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1110 bool IgnoreThreshold = false)
1111 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1112 ComputeFullInlineCost(OptComputeFullInlineCost ||
1113 Params.ComputeFullInlineCost || ORE ||
1114 isCostBenefitAnalysisEnabled()),
1115 Params(Params), Threshold(Params.DefaultThreshold),
1116 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1117 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1118 Writer(this) {
1119 AllowRecursiveCall = *Params.AllowRecursiveCall;
1120 }
1121
1122 /// Annotation Writer for instruction details
1123 InlineCostAnnotationWriter Writer;
1124
1125 void dump();
1126
1127 // Prints the same analysis as dump(), but its definition is not dependent
1128 // on the build.
1129 void print(raw_ostream &OS);
1130
1131 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1132 if (InstructionCostDetailMap.contains(I))
1133 return InstructionCostDetailMap[I];
1134 return std::nullopt;
1135 }
1136
1137 virtual ~InlineCostCallAnalyzer() = default;
1138 int getThreshold() const { return Threshold; }
1139 int getCost() const { return Cost; }
1140 int getStaticBonusApplied() const { return StaticBonusApplied; }
1141 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1142 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1143 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1144};
1145
1146// Return true if CB is the sole call to local function Callee.
1147static bool isSoleCallToLocalFunction(const CallBase &CB,
1148 const Function &Callee) {
1149 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1150 &Callee == CB.getCalledFunction();
1151}
1152
1153class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1154private:
1155 InlineCostFeatures Cost = {};
1156
1157 // FIXME: These constants are taken from the heuristic-based cost visitor.
1158 // These should be removed entirely in a later revision to avoid reliance on
1159 // heuristics in the ML inliner.
1160 static constexpr int JTCostMultiplier = 2;
1161 static constexpr int CaseClusterCostMultiplier = 2;
1162 static constexpr int SwitchDefaultDestCostMultiplier = 2;
1163 static constexpr int SwitchCostMultiplier = 2;
1164
1165 // FIXME: These are taken from the heuristic-based cost visitor: we should
1166 // eventually abstract these to the CallAnalyzer to avoid duplication.
1167 unsigned SROACostSavingOpportunities = 0;
1168 int VectorBonus = 0;
1169 int SingleBBBonus = 0;
1170 int Threshold = 5;
1171
1172 DenseMap<AllocaInst *, unsigned> SROACosts;
1173
1174 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1175 Cost[static_cast<size_t>(Feature)] += Delta;
1176 }
1177
1178 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1179 Cost[static_cast<size_t>(Feature)] = Value;
1180 }
1181
1182 void onDisableSROA(AllocaInst *Arg) override {
1183 auto CostIt = SROACosts.find(Arg);
1184 if (CostIt == SROACosts.end())
1185 return;
1186
1187 increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1188 SROACostSavingOpportunities -= CostIt->second;
1189 SROACosts.erase(CostIt);
1190 }
1191
1192 void onDisableLoadElimination() override {
1193 set(InlineCostFeatureIndex::load_elimination, 1);
1194 }
1195
1196 void onCallPenalty() override {
1197 increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1198 }
1199
1200 void onCallArgumentSetup(const CallBase &Call) override {
1201 increment(InlineCostFeatureIndex::call_argument_setup,
1202 Call.arg_size() * InstrCost);
1203 }
1204
1205 void onLoadRelativeIntrinsic() override {
1206 increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1207 }
1208
1209 void onLoweredCall(Function *F, CallBase &Call,
1210 bool IsIndirectCall) override {
1211 increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1212 Call.arg_size() * InstrCost);
1213
1214 if (IsIndirectCall) {
1215 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1216 /*HintThreshold*/ {},
1217 /*ColdThreshold*/ {},
1218 /*OptSizeThreshold*/ {},
1219 /*OptMinSizeThreshold*/ {},
1220 /*HotCallSiteThreshold*/ {},
1221 /*LocallyHotCallSiteThreshold*/ {},
1222 /*ColdCallSiteThreshold*/ {},
1223 /*ComputeFullInlineCost*/ true,
1224 /*EnableDeferral*/ true};
1225 IndirectCallParams.DefaultThreshold =
1227 InlineConstants::IndirectCallThreshold;
1228 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1229 GetAssumptionCache, GetBFI, PSI, ORE, false,
1230 true);
1231 if (CA.analyze().isSuccess()) {
1232 increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1233 CA.getCost());
1234 increment(InlineCostFeatureIndex::nested_inlines, 1);
1235 }
1236 } else {
1237 onCallPenalty();
1238 }
1239 }
1240
1241 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1242 bool DefaultDestUndefined) override {
1243 if (JumpTableSize) {
1244 if (!DefaultDestUndefined)
1245 increment(InlineCostFeatureIndex::switch_default_dest_penalty,
1246 SwitchDefaultDestCostMultiplier * InstrCost);
1247 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1248 JTCostMultiplier * InstrCost;
1249 increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1250 return;
1251 }
1252
1253 if (NumCaseCluster <= 3) {
1254 increment(InlineCostFeatureIndex::case_cluster_penalty,
1255 (NumCaseCluster - DefaultDestUndefined) *
1256 CaseClusterCostMultiplier * InstrCost);
1257 return;
1258 }
1259
1260 int64_t ExpectedNumberOfCompare =
1261 getExpectedNumberOfCompare(NumCaseCluster);
1262
1263 int64_t SwitchCost =
1264 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1265 increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1266 }
1267
1268 void onMissedSimplification() override {
1269 increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1270 InstrCost);
1271 }
1272
1273 void onInitializeSROAArg(AllocaInst *Arg) override {
1274 auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1275 SROACosts[Arg] = SROAArgCost;
1276 SROACostSavingOpportunities += SROAArgCost;
1277 }
1278
1279 void onAggregateSROAUse(AllocaInst *Arg) override {
1280 SROACosts.find(Arg)->second += InstrCost;
1281 SROACostSavingOpportunities += InstrCost;
1282 }
1283
1284 void onBlockAnalyzed(const BasicBlock *BB) override {
1285 if (BB->getTerminator()->getNumSuccessors() > 1)
1286 set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1287 Threshold -= SingleBBBonus;
1288 }
1289
1290 InlineResult finalizeAnalysis() override {
1291 auto *Caller = CandidateCall.getFunction();
1292 if (Caller->hasMinSize()) {
1293 DominatorTree DT(F);
1294 LoopInfo LI(DT);
1295 for (Loop *L : LI) {
1296 // Ignore loops that will not be executed
1297 if (DeadBlocks.count(L->getHeader()))
1298 continue;
1299 increment(InlineCostFeatureIndex::num_loops,
1300 InlineConstants::LoopPenalty);
1301 }
1302 }
1303 set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1304 set(InlineCostFeatureIndex::simplified_instructions,
1305 NumInstructionsSimplified);
1306 set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1307 set(InlineCostFeatureIndex::constant_offset_ptr_args,
1308 NumConstantOffsetPtrArgs);
1309 set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1310
1311 if (NumVectorInstructions <= NumInstructions / 10)
1312 Threshold -= VectorBonus;
1313 else if (NumVectorInstructions <= NumInstructions / 2)
1314 Threshold -= VectorBonus / 2;
1315
1316 set(InlineCostFeatureIndex::threshold, Threshold);
1317
1318 return InlineResult::success();
1319 }
1320
1321 bool shouldStop() override { return false; }
1322
1323 void onLoadEliminationOpportunity() override {
1324 increment(InlineCostFeatureIndex::load_elimination, 1);
1325 }
1326
1327 InlineResult onAnalysisStart() override {
1328 increment(InlineCostFeatureIndex::callsite_cost,
1329 -1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1330
1331 set(InlineCostFeatureIndex::cold_cc_penalty,
1332 (F.getCallingConv() == CallingConv::Cold));
1333
1334 set(InlineCostFeatureIndex::last_call_to_static_bonus,
1335 isSoleCallToLocalFunction(CandidateCall, F));
1336
1337 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1338 // analyzer - instead, we should abstract it to a common method in the
1339 // CallAnalyzer
1340 int SingleBBBonusPercent = 50;
1341 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1342 Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1343 Threshold *= TTI.getInliningThresholdMultiplier();
1344 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1345 VectorBonus = Threshold * VectorBonusPercent / 100;
1346 Threshold += (SingleBBBonus + VectorBonus);
1347
1348 return InlineResult::success();
1349 }
1350
1351public:
1352 InlineCostFeaturesAnalyzer(
1353 const TargetTransformInfo &TTI,
1354 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1357 CallBase &Call)
1358 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1359
1360 const InlineCostFeatures &features() const { return Cost; }
1361};
1362
1363} // namespace
1364
1365/// Test whether the given value is an Alloca-derived function argument.
1366bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1367 return SROAArgValues.count(V);
1368}
1369
1370void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1371 onDisableSROA(SROAArg);
1372 EnabledSROAAllocas.erase(SROAArg);
1373 disableLoadElimination();
1374}
1375
1376void InlineCostAnnotationWriter::emitInstructionAnnot(
1377 const Instruction *I, formatted_raw_ostream &OS) {
1378 // The cost of inlining of the given instruction is always printed.
1379 // The threshold delta is printed only when it is non-zero. It happens
1380 // when we decided to give a bonus at a particular instruction.
1381 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1382 if (!Record)
1383 OS << "; No analysis for the instruction";
1384 else {
1385 OS << "; cost before = " << Record->CostBefore
1386 << ", cost after = " << Record->CostAfter
1387 << ", threshold before = " << Record->ThresholdBefore
1388 << ", threshold after = " << Record->ThresholdAfter << ", ";
1389 OS << "cost delta = " << Record->getCostDelta();
1390 if (Record->hasThresholdChanged())
1391 OS << ", threshold delta = " << Record->getThresholdDelta();
1392 }
1393 auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1394 if (C) {
1395 OS << ", simplified to ";
1396 (*C)->print(OS, true);
1397 }
1398 OS << "\n";
1399}
1400
1401/// If 'V' maps to a SROA candidate, disable SROA for it.
1402void CallAnalyzer::disableSROA(Value *V) {
1403 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1404 disableSROAForArg(SROAArg);
1405 }
1406}
1407
1408void CallAnalyzer::disableLoadElimination() {
1409 if (EnableLoadElimination) {
1410 onDisableLoadElimination();
1411 EnableLoadElimination = false;
1412 }
1413}
1414
1415/// Accumulate a constant GEP offset into an APInt if possible.
1416///
1417/// Returns false if unable to compute the offset for any reason. Respects any
1418/// simplified values known during the analysis of this callsite.
1419bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1420 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1421 assert(IntPtrWidth == Offset.getBitWidth());
1422
1424 GTI != GTE; ++GTI) {
1425 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1426 if (!OpC)
1427 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1428 OpC = dyn_cast<ConstantInt>(SimpleOp);
1429 if (!OpC)
1430 return false;
1431 if (OpC->isZero())
1432 continue;
1433
1434 // Handle a struct index, which adds its field offset to the pointer.
1435 if (StructType *STy = GTI.getStructTypeOrNull()) {
1436 unsigned ElementIdx = OpC->getZExtValue();
1437 const StructLayout *SL = DL.getStructLayout(STy);
1438 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1439 continue;
1440 }
1441
1442 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1443 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1444 }
1445 return true;
1446}
1447
1448/// Use TTI to check whether a GEP is free.
1449///
1450/// Respects any simplified values known during the analysis of this callsite.
1451bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1452 SmallVector<Value *, 4> Operands;
1453 Operands.push_back(GEP.getOperand(0));
1454 for (const Use &Op : GEP.indices())
1455 if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1456 Operands.push_back(SimpleOp);
1457 else
1458 Operands.push_back(Op);
1459 return TTI.getInstructionCost(&GEP, Operands,
1460 TargetTransformInfo::TCK_SizeAndLatency) ==
1461 TargetTransformInfo::TCC_Free;
1462}
1463
1464bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1465 disableSROA(I.getOperand(0));
1466
1467 // Check whether inlining will turn a dynamic alloca into a static
1468 // alloca and handle that case.
1469 if (I.isArrayAllocation()) {
1470 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1471 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1472 // Sometimes a dynamic alloca could be converted into a static alloca
1473 // after this constant prop, and become a huge static alloca on an
1474 // unconditional CFG path. Avoid inlining if this is going to happen above
1475 // a threshold.
1476 // FIXME: If the threshold is removed or lowered too much, we could end up
1477 // being too pessimistic and prevent inlining non-problematic code. This
1478 // could result in unintended perf regressions. A better overall strategy
1479 // is needed to track stack usage during inlining.
1480 Type *Ty = I.getAllocatedType();
1481 AllocatedSize = SaturatingMultiplyAdd(
1482 AllocSize->getLimitedValue(),
1483 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1484 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1485 HasDynamicAlloca = true;
1486 return false;
1487 }
1488 }
1489
1490 // Accumulate the allocated size.
1491 if (I.isStaticAlloca()) {
1492 Type *Ty = I.getAllocatedType();
1493 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1494 AllocatedSize);
1495 }
1496
1497 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1498 // a variety of reasons, and so we would like to not inline them into
1499 // functions which don't currently have a dynamic alloca. This simply
1500 // disables inlining altogether in the presence of a dynamic alloca.
1501 if (!I.isStaticAlloca())
1502 HasDynamicAlloca = true;
1503
1504 return false;
1505}
1506
1507bool CallAnalyzer::visitPHI(PHINode &I) {
1508 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1509 // though we don't want to propagate its bonuses. The idea is to disable
1510 // SROA if it *might* be used in an inappropriate manner.
1511
1512 // Phi nodes are always zero-cost.
1513 // FIXME: Pointer sizes may differ between different address spaces, so do we
1514 // need to use correct address space in the call to getPointerSizeInBits here?
1515 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1516 // see the ZeroOffset is used as a dummy value, so we can probably use any
1517 // bit width for the ZeroOffset?
1518 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1519 bool CheckSROA = I.getType()->isPointerTy();
1520
1521 // Track the constant or pointer with constant offset we've seen so far.
1522 Constant *FirstC = nullptr;
1523 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1524 Value *FirstV = nullptr;
1525
1526 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1527 BasicBlock *Pred = I.getIncomingBlock(i);
1528 // If the incoming block is dead, skip the incoming block.
1529 if (DeadBlocks.count(Pred))
1530 continue;
1531 // If the parent block of phi is not the known successor of the incoming
1532 // block, skip the incoming block.
1533 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1534 if (KnownSuccessor && KnownSuccessor != I.getParent())
1535 continue;
1536
1537 Value *V = I.getIncomingValue(i);
1538 // If the incoming value is this phi itself, skip the incoming value.
1539 if (&I == V)
1540 continue;
1541
1542 Constant *C = dyn_cast<Constant>(V);
1543 if (!C)
1544 C = SimplifiedValues.lookup(V);
1545
1546 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1547 if (!C && CheckSROA)
1548 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1549
1550 if (!C && !BaseAndOffset.first)
1551 // The incoming value is neither a constant nor a pointer with constant
1552 // offset, exit early.
1553 return true;
1554
1555 if (FirstC) {
1556 if (FirstC == C)
1557 // If we've seen a constant incoming value before and it is the same
1558 // constant we see this time, continue checking the next incoming value.
1559 continue;
1560 // Otherwise early exit because we either see a different constant or saw
1561 // a constant before but we have a pointer with constant offset this time.
1562 return true;
1563 }
1564
1565 if (FirstV) {
1566 // The same logic as above, but check pointer with constant offset here.
1567 if (FirstBaseAndOffset == BaseAndOffset)
1568 continue;
1569 return true;
1570 }
1571
1572 if (C) {
1573 // This is the 1st time we've seen a constant, record it.
1574 FirstC = C;
1575 continue;
1576 }
1577
1578 // The remaining case is that this is the 1st time we've seen a pointer with
1579 // constant offset, record it.
1580 FirstV = V;
1581 FirstBaseAndOffset = BaseAndOffset;
1582 }
1583
1584 // Check if we can map phi to a constant.
1585 if (FirstC) {
1586 SimplifiedValues[&I] = FirstC;
1587 return true;
1588 }
1589
1590 // Check if we can map phi to a pointer with constant offset.
1591 if (FirstBaseAndOffset.first) {
1592 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1593
1594 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1595 SROAArgValues[&I] = SROAArg;
1596 }
1597
1598 return true;
1599}
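// Illustrative example (editorial, hypothetical IR): for
//   %p = phi ptr [ %a, %bb1 ], [ %a, %bb2 ]
// where %a is a call argument already recorded in ConstantOffsetPtrs, both
// incoming values share one (base, offset) pair, so the phi itself is added
// to ConstantOffsetPtrs and SROA stays viable; had the incoming values
// disagreed, the loop above would have bailed out and left the phi opaque.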
1600
1601/// Check we can fold GEPs of constant-offset call site argument pointers.
1602/// This requires target data and inbounds GEPs.
1603///
1604/// \return true if the specified GEP can be folded.
1605bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1606 // Check if we have a base + offset for the pointer.
1607 std::pair<Value *, APInt> BaseAndOffset =
1608 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1609 if (!BaseAndOffset.first)
1610 return false;
1611
1612 // Check if the offset of this GEP is constant, and if so accumulate it
1613 // into Offset.
1614 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1615 return false;
1616
1617 // Add the result as a new mapping to Base + Offset.
1618 ConstantOffsetPtrs[&I] = BaseAndOffset;
1619
1620 return true;
1621}
1622
1623bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1624 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1625
1626 // Lambda to check whether a GEP's indices are all constant.
1627 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1628 for (const Use &Op : GEP.indices())
1629 if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1630 return false;
1631 return true;
1632 };
1633
1634 if (!DisableGEPConstOperand)
1635 if (simplifyInstruction(I))
1636 return true;
1637
1638 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1639 if (SROAArg)
1640 SROAArgValues[&I] = SROAArg;
1641
1642 // Constant GEPs are modeled as free.
1643 return true;
1644 }
1645
1646 // Variable GEPs will require math and will disable SROA.
1647 if (SROAArg)
1648 disableSROAForArg(SROAArg);
1649 return isGEPFree(I);
1650}
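// Illustrative example (editorial, hypothetical IR): with %p an alloca-derived
// argument tracked in ConstantOffsetPtrs,
//   %q = getelementptr inbounds [16 x i32], ptr %p, i64 0, i64 3
// is folded by canFoldInboundsGEP into (%p, +12) and modeled as free, and %q
// inherits %p's SROA candidacy. Replacing the last index with a variable
// (e.g. i64 %i) instead disables SROA for that argument and charges the GEP
// through isGEPFree().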
1651
1652/// Simplify \p I if its operands are constants and update SimplifiedValues.
1653bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1654 SmallVector<Constant *> COps;
1655 for (Value *Op : I.operands()) {
1656 Constant *COp = dyn_cast<Constant>(Op);
1657 if (!COp)
1658 COp = SimplifiedValues.lookup(Op);
1659 if (!COp)
1660 return false;
1661 COps.push_back(COp);
1662 }
1663 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1664 if (!C)
1665 return false;
1666 SimplifiedValues[&I] = C;
1667 return true;
1668}
1669
1670/// Try to simplify a call to llvm.is.constant.
1671///
1672/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1673/// we expect calls of this specific intrinsic to be infrequent.
1674///
1675/// FIXME: Given that we know CB's parent (F) caller
1676/// (CandidateCall->getParent()->getParent()), we might be able to determine
1677/// whether inlining F into F's caller would change how the call to
1678/// llvm.is.constant would evaluate.
1679bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1680 Value *Arg = CB.getArgOperand(0);
1681 auto *C = dyn_cast<Constant>(Arg);
1682
1683 if (!C)
1684 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1685
1686 Type *RT = CB.getFunctionType()->getReturnType();
1687 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1688 return true;
1689}
1690
1691bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1692 // As per the langref, "The fourth argument to llvm.objectsize determines if
1693 // the value should be evaluated at runtime."
1694 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1695 return false;
1696
1697 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1698 /*MustSucceed=*/true);
1699 Constant *C = dyn_cast_or_null<Constant>(V);
1700 if (C)
1701 SimplifiedValues[&CB] = C;
1702 return C;
1703}
1704
1705bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1706 // Propagate constants through bitcasts.
1707 if (simplifyInstruction(I))
1708 return true;
1709
1710 // Track base/offsets through casts
1711 std::pair<Value *, APInt> BaseAndOffset =
1712 ConstantOffsetPtrs.lookup(I.getOperand(0));
1713 // Casts don't change the offset, just wrap it up.
1714 if (BaseAndOffset.first)
1715 ConstantOffsetPtrs[&I] = BaseAndOffset;
1716
1717 // Also look for SROA candidates here.
1718 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1719 SROAArgValues[&I] = SROAArg;
1720
1721 // Bitcasts are always zero cost.
1722 return true;
1723}
1724
1725bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1726 // Propagate constants through ptrtoint.
1727 if (simplifyInstruction(I))
1728 return true;
1729
1730 // Track base/offset pairs when converted to a plain integer provided the
1731 // integer is large enough to represent the pointer.
1732 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1733 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1734 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1735 std::pair<Value *, APInt> BaseAndOffset =
1736 ConstantOffsetPtrs.lookup(I.getOperand(0));
1737 if (BaseAndOffset.first)
1738 ConstantOffsetPtrs[&I] = BaseAndOffset;
1739 }
1740
1741 // This is really weird. Technically, ptrtoint will disable SROA. However,
1742 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1743 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1744 // would block SROA would also block SROA if applied directly to a pointer,
1745 // and so we can just add the integer in here. The only places where SROA is
1746 // preserved either cannot fire on an integer, or won't in-and-of themselves
1747 // disable SROA (ext) w/o some later use that we would see and disable.
1748 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1749 SROAArgValues[&I] = SROAArg;
1750
1751 return TargetTransformInfo::TCC_Free ==
1752 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1753}
1754
1755bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1756 // Propagate constants through inttoptr.
1757 if (simplifyInstruction(I))
1758 return true;
1759
1760 // Track base/offset pairs when round-tripped through a pointer without
1761 // modifications provided the integer is not too large.
1762 Value *Op = I.getOperand(0);
1763 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1764 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1765 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1766 if (BaseAndOffset.first)
1767 ConstantOffsetPtrs[&I] = BaseAndOffset;
1768 }
1769
1770 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1771 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1772 SROAArgValues[&I] = SROAArg;
1773
1774 return TargetTransformInfo::TCC_Free ==
1775 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1776}
1777
1778bool CallAnalyzer::visitCastInst(CastInst &I) {
1779 // Propagate constants through casts.
1780 if (simplifyInstruction(I))
1781 return true;
1782
1783 // Disable SROA in the face of arbitrary casts we don't explicitly list
1784 // elsewhere.
1785 disableSROA(I.getOperand(0));
1786
1787 // If this is a floating-point cast, and the target says this operation
1788 // is expensive, this may eventually become a library call. Treat the cost
1789 // as such.
1790 switch (I.getOpcode()) {
1791 case Instruction::FPTrunc:
1792 case Instruction::FPExt:
1793 case Instruction::UIToFP:
1794 case Instruction::SIToFP:
1795 case Instruction::FPToUI:
1796 case Instruction::FPToSI:
1797 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1798 onCallPenalty();
1799 break;
1800 default:
1801 break;
1802 }
1803
1804 return TargetTransformInfo::TCC_Free ==
1805 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1806}
1807
1808bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1809 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1810}
1811
1812bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1813 // Does the *call site* have the NonNull attribute set on an argument? We
1814 // use the attribute on the call site to memoize any analysis done in the
1815 // caller. This will also trip if the callee function has a non-null
1816 // parameter attribute, but that's a less interesting case because hopefully
1817 // the callee would already have been simplified based on that.
1818 if (Argument *A = dyn_cast<Argument>(V))
1819 if (paramHasAttr(A, Attribute::NonNull))
1820 return true;
1821
1822 // Is this an alloca in the caller? This is distinct from the attribute case
1823 // above because attributes aren't updated within the inliner itself and we
1824 // always want to catch the alloca derived case.
1825 if (isAllocaDerivedArg(V))
1826 // We can actually predict the result of comparisons between an
1827 // alloca-derived value and null. Note that this fires regardless of
1828 // SROA firing.
1829 return true;
1830
1831 return false;
1832}
1833
1834bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1835 // If the normal destination of the invoke or the parent block of the call
1836 // site is unreachable-terminated, there is little point in inlining this
1837 // unless there is literally zero cost.
1838 // FIXME: Note that it is possible that an unreachable-terminated block has a
1839 // hot entry. For example, in the scenario below, inlining hot_call_X() may
1840 // be beneficial:
1841 // main() {
1842 // hot_call_1();
1843 // ...
1844 // hot_call_N()
1845 // exit(0);
1846 // }
1847 // For now, we are not handling this corner case here as it is rare in real
1848 // code. In future, we should elaborate this based on BPI and BFI in more
1849 // general threshold adjusting heuristics in updateThreshold().
1850 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1851 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1852 return false;
1853 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1854 return false;
1855
1856 return true;
1857}
1858
1859bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1860 BlockFrequencyInfo *CallerBFI) {
1861 // If global profile summary is available, then callsite's coldness is
1862 // determined based on that.
1863 if (PSI && PSI->hasProfileSummary())
1864 return PSI->isColdCallSite(Call, CallerBFI);
1865
1866 // Otherwise we need BFI to be available.
1867 if (!CallerBFI)
1868 return false;
1869
1870 // Determine if the callsite is cold relative to caller's entry. We could
1871 // potentially cache the computation of scaled entry frequency, but the added
1872 // complexity is not worth it unless this scaling shows up high in the
1873 // profiles.
1874 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1875 auto CallSiteBB = Call.getParent();
1876 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1877 auto CallerEntryFreq =
1878 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1879 return CallSiteFreq < CallerEntryFreq * ColdProb;
1880}
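// Worked example (editorial; assumes the cold-callsite relative frequency
// knob keeps its usual default of 2%): with ColdProb = 2/100, a caller entry
// frequency of 1000 and a call-site block frequency of 15 gives
// 15 < 1000 * 0.02 = 20, so the site is classified as cold and
// updateThreshold() clamps it to ColdCallSiteThreshold.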
1881
1882std::optional<int>
1883InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1884 BlockFrequencyInfo *CallerBFI) {
1885
1886 // If global profile summary is available, then callsite's hotness is
1887 // determined based on that.
1888 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1889 return Params.HotCallSiteThreshold;
1890
1891 // Otherwise we need BFI to be available and to have a locally hot callsite
1892 // threshold.
1893 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1894 return std::nullopt;
1895
1896 // Determine if the callsite is hot relative to caller's entry. We could
1897 // potentially cache the computation of scaled entry frequency, but the added
1898 // complexity is not worth it unless this scaling shows up high in the
1899 // profiles.
1900 const BasicBlock *CallSiteBB = Call.getParent();
1901 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1902 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1903 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1904 if (Limit && CallSiteFreq >= *Limit)
1905 return Params.LocallyHotCallSiteThreshold;
1906
1907 // Otherwise treat it normally.
1908 return std::nullopt;
1909}
1910
1911void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1912 // If no size growth is allowed for this inlining, set Threshold to 0.
1913 if (!allowSizeGrowth(Call)) {
1914 Threshold = 0;
1915 return;
1916 }
1917
1918 Function *Caller = Call.getCaller();
1919
1920 // return min(A, B) if B is valid.
1921 auto MinIfValid = [](int A, std::optional<int> B) {
1922 return B ? std::min(A, *B) : A;
1923 };
1924
1925 // return max(A, B) if B is valid.
1926 auto MaxIfValid = [](int A, std::optional<int> B) {
1927 return B ? std::max(A, *B) : A;
1928 };
1929
1930 // Various bonus percentages. These are multiplied by Threshold to get the
1931 // bonus values.
1932 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1933 // basic block at the given callsite context. This is speculatively applied
1934 // and withdrawn if more than one basic block is seen.
1935 //
1936 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1937 // of the last call to a static function as inlining such functions is
1938 // guaranteed to reduce code size.
1939 //
1940 // These bonus percentages may be set to 0 based on properties of the caller
1941 // and the callsite.
1942 int SingleBBBonusPercent = 50;
1943 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1945
1946 // Lambda to set all the above bonus and bonus percentages to 0.
1947 auto DisallowAllBonuses = [&]() {
1948 SingleBBBonusPercent = 0;
1949 VectorBonusPercent = 0;
1950 LastCallToStaticBonus = 0;
1951 };
1952
1953 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1954 // and reduce the threshold if the caller has the necessary attribute.
1955 if (Caller->hasMinSize()) {
1956 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1957 // For minsize, we want to disable the single BB bonus and the vector
1958 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1959 // a static function will, at the minimum, eliminate the parameter setup and
1960 // call/return instructions.
1961 SingleBBBonusPercent = 0;
1962 VectorBonusPercent = 0;
1963 } else if (Caller->hasOptSize())
1964 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1965
1966 // Adjust the threshold based on inlinehint attribute and profile based
1967 // hotness information if the caller does not have MinSize attribute.
1968 if (!Caller->hasMinSize()) {
1969 if (Callee.hasFnAttribute(Attribute::InlineHint))
1970 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1971
1972 // FIXME: After switching to the new passmanager, simplify the logic below
1973 // by checking only the callsite hotness/coldness as we will reliably
1974 // have local profile information.
1975 //
1976 // Callsite hotness and coldness can be determined if sample profile is
1977 // used (which adds hotness metadata to calls) or if caller's
1978 // BlockFrequencyInfo is available.
1979 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1980 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1981 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1982 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1983 // FIXME: This should update the threshold only if it exceeds the
1984 // current threshold, but AutoFDO + ThinLTO currently relies on this
1985 // behavior to prevent inlining of hot callsites during ThinLTO
1986 // compile phase.
1987 Threshold = *HotCallSiteThreshold;
1988 } else if (isColdCallSite(Call, CallerBFI)) {
1989 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1990 // Do not apply bonuses for a cold callsite including the
1991 // LastCallToStatic bonus. While this bonus might result in code size
1992 // reduction, it can cause the size of a non-cold caller to increase
1993 // preventing it from being inlined.
1994 DisallowAllBonuses();
1995 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1996 } else if (PSI) {
1997 // Use callee's global profile information only if we have no way of
1998 // determining this via callsite information.
1999 if (PSI->isFunctionEntryHot(&Callee)) {
2000 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2001 // If callsite hotness can not be determined, we may still know
2002 // that the callee is hot and treat it as a weaker hint for threshold
2003 // increase.
2004 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2005 } else if (PSI->isFunctionEntryCold(&Callee)) {
2006 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2007 // Do not apply bonuses for a cold callee including the
2008 // LastCallToStatic bonus. While this bonus might result in code size
2009 // reduction, it can cause the size of a non-cold caller to increase
2010 // preventing it from being inlined.
2011 DisallowAllBonuses();
2012 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2013 }
2014 }
2015 }
2016
2017 Threshold += TTI.adjustInliningThreshold(&Call);
2018
2019 // Finally, take the target-specific inlining threshold multiplier into
2020 // account.
2021 Threshold *= TTI.getInliningThresholdMultiplier();
2022
2023 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2024 VectorBonus = Threshold * VectorBonusPercent / 100;
2025
2026 // If there is only one call of the function, and it has internal linkage,
2027 // the cost of inlining it drops dramatically. It may seem odd to update
2028 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2029 if (isSoleCallToLocalFunction(Call, F)) {
2030 Cost -= LastCallToStaticBonus;
2031 StaticBonusApplied = LastCallToStaticBonus;
2032 }
2033}
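// Worked example (editorial): absent attribute, profile, and target
// adjustments, a threshold of 225 with SingleBBBonusPercent = 50 and a vector
// bonus of 150% (a common TTI default) yields SingleBBBonus = 225 * 50 / 100
// = 112 and VectorBonus = 225 * 150 / 100 = 337. Both bonuses are speculative
// and are withdrawn later if the callee turns out to have multiple reachable
// blocks or too few vector instructions.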
2034
2035bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2036 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2037 // First try to handle simplified comparisons.
2038 if (simplifyInstruction(I))
2039 return true;
2040
2041 if (I.getOpcode() == Instruction::FCmp)
2042 return false;
2043
2044 // Otherwise look for a comparison between constant offset pointers with
2045 // a common base.
2046 Value *LHSBase, *RHSBase;
2047 APInt LHSOffset, RHSOffset;
2048 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2049 if (LHSBase) {
2050 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2051 if (RHSBase && LHSBase == RHSBase) {
2052 // We have common bases, fold the icmp to a constant based on the
2053 // offsets.
2054 SimplifiedValues[&I] = ConstantInt::getBool(
2055 I.getType(),
2056 ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
2057 ++NumConstantPtrCmps;
2058 return true;
2059 }
2060 }
2061
2062 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2063 for (auto *User : I.users())
2064 if (auto *Instr = dyn_cast<Instruction>(User))
2065 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2066 return false;
2067 return true;
2068 };
2069
2070 // If the comparison is an equality comparison with null, we can simplify it
2071 // if we know the value (argument) can't be null
2072 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2073 if (isKnownNonNullInCallee(I.getOperand(0))) {
2074 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2075 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2076 : ConstantInt::getFalse(I.getType());
2077 return true;
2078 }
2079 // Implicit null checks act as unconditional branches and their comparisons
2080 // should be treated as simplified and free of cost.
2081 if (isImplicitNullCheckCmp(I))
2082 return true;
2083 }
2084 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2085}
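// Illustrative example (editorial, hypothetical IR): when both operands are
// inbounds GEPs off the same tracked pointer argument %p,
//   %a = getelementptr inbounds i32, ptr %p, i64 1
//   %b = getelementptr inbounds i32, ptr %p, i64 4
//   %c = icmp ult ptr %a, %b
// ConstantOffsetPtrs records (%p, +4) and (%p, +16), so the compare above
// folds to true and is modeled as free.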
2086
2087bool CallAnalyzer::visitSub(BinaryOperator &I) {
2088 // Try to handle a special case: we can fold computing the difference of two
2089 // constant-related pointers.
2090 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2091 Value *LHSBase, *RHSBase;
2092 APInt LHSOffset, RHSOffset;
2093 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2094 if (LHSBase) {
2095 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2096 if (RHSBase && LHSBase == RHSBase) {
2097 // We have common bases, fold the subtract to a constant based on the
2098 // offsets.
2099 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2100 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2101 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2102 SimplifiedValues[&I] = C;
2103 ++NumConstantPtrDiffs;
2104 return true;
2105 }
2106 }
2107 }
2108
2109 // Otherwise, fall back to the generic logic for simplifying and handling
2110 // instructions.
2111 return Base::visitSub(I);
2112}
2113
2114bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2115 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2116 Constant *CLHS = dyn_cast<Constant>(LHS);
2117 if (!CLHS)
2118 CLHS = SimplifiedValues.lookup(LHS);
2119 Constant *CRHS = dyn_cast<Constant>(RHS);
2120 if (!CRHS)
2121 CRHS = SimplifiedValues.lookup(RHS);
2122
2123 Value *SimpleV = nullptr;
2124 if (auto FI = dyn_cast<FPMathOperator>(&I))
2125 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2126 FI->getFastMathFlags(), DL);
2127 else
2128 SimpleV =
2129 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2130
2131 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2132 SimplifiedValues[&I] = C;
2133
2134 if (SimpleV)
2135 return true;
2136
2137 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2138 disableSROA(LHS);
2139 disableSROA(RHS);
2140
2141 // If the instruction is floating point, and the target says this operation
2142 // is expensive, this may eventually become a library call. Treat the cost
2143 // as such. Unless it's fneg which can be implemented with an xor.
2144 using namespace llvm::PatternMatch;
2145 if (I.getType()->isFloatingPointTy() &&
2146 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2147 !match(&I, m_FNeg(m_Value())))
2148 onCallPenalty();
2149
2150 return false;
2151}
2152
2153bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2154 Value *Op = I.getOperand(0);
2155 Constant *COp = dyn_cast<Constant>(Op);
2156 if (!COp)
2157 COp = SimplifiedValues.lookup(Op);
2158
2159 Value *SimpleV = simplifyFNegInst(
2160 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2161
2162 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2163 SimplifiedValues[&I] = C;
2164
2165 if (SimpleV)
2166 return true;
2167
2168 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2169 disableSROA(Op);
2170
2171 return false;
2172}
2173
2174bool CallAnalyzer::visitLoad(LoadInst &I) {
2175 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2176 return true;
2177
2178 // If the data is already loaded from this address and hasn't been clobbered
2179 // by any stores or calls, this load is likely to be redundant and can be
2180 // eliminated.
2181 if (EnableLoadElimination &&
2182 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2183 onLoadEliminationOpportunity();
2184 return true;
2185 }
2186
2187 onMemAccess();
2188 return false;
2189}
2190
2191bool CallAnalyzer::visitStore(StoreInst &I) {
2192 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2193 return true;
2194
2195 // The store can potentially clobber loads and prevent repeated loads from
2196 // being eliminated.
2197 // FIXME:
2198 // 1. We can probably keep an initial set of eliminatable loads subtracted
2199 // from the cost even when we finally see a store. We just need to disable
2200 // *further* accumulation of elimination savings.
2201 // 2. We should probably at some point thread MemorySSA for the callee into
2202 // this and then use that to actually compute *really* precise savings.
2203 disableLoadElimination();
2204
2205 onMemAccess();
2206 return false;
2207}
2208
2209bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2210 // Constant folding for extract value is trivial.
2211 if (simplifyInstruction(I))
2212 return true;
2213
2214 // SROA can't look through these, but they may be free.
2215 return Base::visitExtractValue(I);
2216}
2217
2218bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2219 // Constant folding for insert value is trivial.
2220 if (simplifyInstruction(I))
2221 return true;
2222
2223 // SROA can't look through these, but they may be free.
2224 return Base::visitInsertValue(I);
2225}
2226
2227/// Try to simplify a call site.
2228///
2229/// Takes a concrete function and callsite and tries to actually simplify it by
2230/// analyzing the arguments and call itself with instsimplify. Returns true if
2231/// it has simplified the callsite to some other entity (a constant), making it
2232/// free.
2233bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2234 // FIXME: Using the instsimplify logic directly for this is inefficient
2235 // because we have to continually rebuild the argument list even when no
2236 // simplifications can be performed. Until that is fixed with remapping
2237 // inside of instsimplify, directly constant fold calls here.
2238 if (!canConstantFoldCallTo(&Call, F))
2239 return false;
2240
2241 // Try to re-map the arguments to constants.
2242 SmallVector<Constant *, 4> ConstantArgs;
2243 ConstantArgs.reserve(Call.arg_size());
2244 for (Value *I : Call.args()) {
2245 Constant *C = dyn_cast<Constant>(I);
2246 if (!C)
2247 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2248 if (!C)
2249 return false; // This argument doesn't map to a constant.
2250
2251 ConstantArgs.push_back(C);
2252 }
2253 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2254 SimplifiedValues[&Call] = C;
2255 return true;
2256 }
2257
2258 return false;
2259}
2260
2261bool CallAnalyzer::visitCallBase(CallBase &Call) {
2262 if (!onCallBaseVisitStart(Call))
2263 return true;
2264
2265 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2266 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2267 // This aborts the entire analysis.
2268 ExposesReturnsTwice = true;
2269 return false;
2270 }
2271 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2272 ContainsNoDuplicateCall = true;
2273
2274 Function *F = Call.getCalledFunction();
2275 bool IsIndirectCall = !F;
2276 if (IsIndirectCall) {
2277 // Check if this happens to be an indirect function call to a known function
2278 // in this inline context. If not, we've done all we can.
2279 Value *Callee = Call.getCalledOperand();
2280 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2281 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2282 onCallArgumentSetup(Call);
2283
2284 if (!Call.onlyReadsMemory())
2285 disableLoadElimination();
2286 return Base::visitCallBase(Call);
2287 }
2288 }
2289
2290 assert(F && "Expected a call to a known function");
2291
2292 // When we have a concrete function, first try to simplify it directly.
2293 if (simplifyCallSite(F, Call))
2294 return true;
2295
2296 // Next check if it is an intrinsic we know about.
2297 // FIXME: Lift this into part of the InstVisitor.
2298 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2299 switch (II->getIntrinsicID()) {
2300 default:
2301 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2302 disableLoadElimination();
2303 return Base::visitCallBase(Call);
2304
2305 case Intrinsic::load_relative:
2306 onLoadRelativeIntrinsic();
2307 return false;
2308
2309 case Intrinsic::memset:
2310 case Intrinsic::memcpy:
2311 case Intrinsic::memmove:
2312 disableLoadElimination();
2313 // SROA can usually chew through these intrinsics, but they aren't free.
2314 return false;
2315 case Intrinsic::icall_branch_funnel:
2316 case Intrinsic::localescape:
2317 HasUninlineableIntrinsic = true;
2318 return false;
2319 case Intrinsic::vastart:
2320 InitsVargArgs = true;
2321 return false;
2322 case Intrinsic::launder_invariant_group:
2323 case Intrinsic::strip_invariant_group:
2324 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2325 SROAArgValues[II] = SROAArg;
2326 return true;
2327 case Intrinsic::is_constant:
2328 return simplifyIntrinsicCallIsConstant(Call);
2329 case Intrinsic::objectsize:
2330 return simplifyIntrinsicCallObjectSize(Call);
2331 }
2332 }
2333
2334 if (F == Call.getFunction()) {
2335 // This flag will fully abort the analysis, so don't bother with anything
2336 // else.
2337 IsRecursiveCall = true;
2338 if (!AllowRecursiveCall)
2339 return false;
2340 }
2341
2342 if (TTI.isLoweredToCall(F)) {
2343 onLoweredCall(F, Call, IsIndirectCall);
2344 }
2345
2346 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2347 disableLoadElimination();
2348 return Base::visitCallBase(Call);
2349}
2350
2351bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2352 // At least one return instruction will be free after inlining.
2353 bool Free = !HasReturn;
2354 HasReturn = true;
2355 return Free;
2356}
2357
2358bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2359 // We model unconditional branches as essentially free -- they really
2360 // shouldn't exist at all, but handling them makes the behavior of the
2361 // inliner more regular and predictable. Interestingly, conditional branches
2362 // which will fold away are also free.
2363 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2364 BI.getMetadata(LLVMContext::MD_make_implicit) ||
2365 isa_and_nonnull<ConstantInt>(
2366 SimplifiedValues.lookup(BI.getCondition()));
2367}
2368
2369bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2370 bool CheckSROA = SI.getType()->isPointerTy();
2371 Value *TrueVal = SI.getTrueValue();
2372 Value *FalseVal = SI.getFalseValue();
2373
2374 Constant *TrueC = dyn_cast<Constant>(TrueVal);
2375 if (!TrueC)
2376 TrueC = SimplifiedValues.lookup(TrueVal);
2377 Constant *FalseC = dyn_cast<Constant>(FalseVal);
2378 if (!FalseC)
2379 FalseC = SimplifiedValues.lookup(FalseVal);
2380 Constant *CondC =
2381 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2382
2383 if (!CondC) {
2384 // Select C, X, X => X
2385 if (TrueC == FalseC && TrueC) {
2386 SimplifiedValues[&SI] = TrueC;
2387 return true;
2388 }
2389
2390 if (!CheckSROA)
2391 return Base::visitSelectInst(SI);
2392
2393 std::pair<Value *, APInt> TrueBaseAndOffset =
2394 ConstantOffsetPtrs.lookup(TrueVal);
2395 std::pair<Value *, APInt> FalseBaseAndOffset =
2396 ConstantOffsetPtrs.lookup(FalseVal);
2397 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2398 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2399
2400 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2401 SROAArgValues[&SI] = SROAArg;
2402 return true;
2403 }
2404
2405 return Base::visitSelectInst(SI);
2406 }
2407
2408 // Select condition is a constant.
2409 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2410 : (CondC->isNullValue()) ? FalseVal
2411 : nullptr;
2412 if (!SelectedV) {
2413 // Condition is a vector constant that is not all 1s or all 0s. If all
2414 // operands are constants, ConstantFoldSelectInstruction() can handle the
2415 // cases such as select vectors.
2416 if (TrueC && FalseC) {
2417 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2418 SimplifiedValues[&SI] = C;
2419 return true;
2420 }
2421 }
2422 return Base::visitSelectInst(SI);
2423 }
2424
2425 // Condition is either all 1s or all 0s. SI can be simplified.
2426 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2427 SimplifiedValues[&SI] = SelectedC;
2428 return true;
2429 }
2430
2431 if (!CheckSROA)
2432 return true;
2433
2434 std::pair<Value *, APInt> BaseAndOffset =
2435 ConstantOffsetPtrs.lookup(SelectedV);
2436 if (BaseAndOffset.first) {
2437 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2438
2439 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2440 SROAArgValues[&SI] = SROAArg;
2441 }
2442
2443 return true;
2444}
2445
2446bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2447 // We model unconditional switches as free, see the comments on handling
2448 // branches.
2449 if (isa<ConstantInt>(SI.getCondition()))
2450 return true;
2451 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2452 if (isa<ConstantInt>(V))
2453 return true;
2454
2455 // Assume the most general case where the switch is lowered into
2456 // either a jump table, bit test, or a balanced binary tree consisting of
2457 // case clusters without merging adjacent clusters with the same
2458 // destination. We do not consider the switches that are lowered with a mix
2459 // of jump table/bit test/binary search tree. The cost of the switch is
2460 // proportional to the size of the tree or the size of jump table range.
2461 //
2462 // NB: We convert large switches which are just used to initialize large phi
2463 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2464 // inlining those. It will prevent inlining in cases where the optimization
2465 // does not (yet) fire.
2466
2467 unsigned JumpTableSize = 0;
2468 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2469 unsigned NumCaseCluster =
2470 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2471
2472 onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
2473 return false;
2474}
2475
2476bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2477 // We never want to inline functions that contain an indirectbr. This is
2478 // incorrect because all the blockaddresses (in static global initializers
2479 // for example) would be referring to the original function, and this
2480 // indirect jump would jump from the inlined copy of the function into the
2481 // original function which is extremely undefined behavior.
2482 // FIXME: This logic isn't really right; we can safely inline functions with
2483 // indirectbr's as long as no other function or global references the
2484 // blockaddress of a block within the current function.
2485 HasIndirectBr = true;
2486 return false;
2487}
2488
2489bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2490 // FIXME: It's not clear that a single instruction is an accurate model for
2491 // the inline cost of a resume instruction.
2492 return false;
2493}
2494
2495bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2496 // FIXME: It's not clear that a single instruction is an accurate model for
2497 // the inline cost of a cleanupret instruction.
2498 return false;
2499}
2500
2501bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2502 // FIXME: It's not clear that a single instruction is an accurate model for
2503 // the inline cost of a catchret instruction.
2504 return false;
2505}
2506
2507bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2508 // FIXME: It might be reasonable to discount the cost of instructions leading
2509 // to unreachable as they have the lowest possible impact on both runtime and
2510 // code size.
2511 return true; // No actual code is needed for unreachable.
2512}
2513
2514bool CallAnalyzer::visitInstruction(Instruction &I) {
2515 // Some instructions are free. All of the free intrinsics can also be
2516 // handled by SROA, etc.
2517 if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2518 TargetTransformInfo::TCC_Free)
2519 return true;
2520
2521 // We found something we don't understand or can't handle. Mark any SROA-able
2522 // values in the operand list as no longer viable.
2523 for (const Use &Op : I.operands())
2524 disableSROA(Op);
2525
2526 return false;
2527}
2528
2529/// Analyze a basic block for its contribution to the inline cost.
2530///
2531/// This method walks the analyzer over every instruction in the given basic
2532/// block and accounts for their cost during inlining at this callsite. It
2533/// aborts early if the threshold has been exceeded or an impossible to inline
2534/// construct has been detected. It returns false if inlining is no longer
2535/// viable, and true if inlining remains viable.
2536 InlineResult
2537 CallAnalyzer::analyzeBlock(BasicBlock *BB,
2538 SmallPtrSetImpl<const Value *> &EphValues) {
2539 for (Instruction &I : *BB) {
2540 // FIXME: Currently, the number of instructions in a function regardless of
2541 // our ability to simplify them during inline to constants or dead code,
2542 // are actually used by the vector bonus heuristic. As long as that's true,
2543 // we have to special case debug intrinsics here to prevent differences in
2544 // inlining due to debug symbols. Eventually, the number of unsimplified
2545 // instructions shouldn't factor into the cost computation, but until then,
2546 // hack around it here.
2547 // Similarly, skip pseudo-probes.
2548 if (I.isDebugOrPseudoInst())
2549 continue;
2550
2551 // Skip ephemeral values.
2552 if (EphValues.count(&I))
2553 continue;
2554
2555 ++NumInstructions;
2556 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2557 ++NumVectorInstructions;
2558
2559 // If the instruction simplified to a constant, there is no cost to this
2560 // instruction. Visit the instructions using our InstVisitor to account for
2561 // all of the per-instruction logic. The visit tree returns true if we
2562 // consumed the instruction in any way, and false if the instruction's base
2563 // cost should count against inlining.
2564 onInstructionAnalysisStart(&I);
2565
2566 if (Base::visit(&I))
2567 ++NumInstructionsSimplified;
2568 else
2569 onMissedSimplification();
2570
2571 onInstructionAnalysisFinish(&I);
2572 using namespace ore;
2573 // If visiting this instruction detected an uninlinable pattern, abort.
2574 InlineResult IR = InlineResult::success();
2575 if (IsRecursiveCall && !AllowRecursiveCall)
2576 IR = InlineResult::failure("recursive");
2577 else if (ExposesReturnsTwice)
2578 IR = InlineResult::failure("exposes returns twice");
2579 else if (HasDynamicAlloca)
2580 IR = InlineResult::failure("dynamic alloca");
2581 else if (HasIndirectBr)
2582 IR = InlineResult::failure("indirect branch");
2583 else if (HasUninlineableIntrinsic)
2584 IR = InlineResult::failure("uninlinable intrinsic");
2585 else if (InitsVargArgs)
2586 IR = InlineResult::failure("varargs");
2587 if (!IR.isSuccess()) {
2588 if (ORE)
2589 ORE->emit([&]() {
2590 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2591 &CandidateCall)
2592 << NV("Callee", &F) << " has uninlinable pattern ("
2593 << NV("InlineResult", IR.getFailureReason())
2594 << ") and cost is not fully computed";
2595 });
2596 return IR;
2597 }
2598
2599 // If the caller is a recursive function then we don't want to inline
2600 // functions which allocate a lot of stack space because it would increase
2601 // the caller stack usage dramatically.
2602 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2603 auto IR =
2604 InlineResult::failure("recursive and allocates too much stack space");
2605 if (ORE)
2606 ORE->emit([&]() {
2607 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2608 &CandidateCall)
2609 << NV("Callee", &F) << " is "
2610 << NV("InlineResult", IR.getFailureReason())
2611 << ". Cost is not fully computed";
2612 });
2613 return IR;
2614 }
2615
2616 if (shouldStop())
2617 return InlineResult::failure(
2618 "Call site analysis is not favorable to inlining.");
2619 }
2620
2621 return InlineResult::success();
2622}
2623
2624/// Compute the base pointer and cumulative constant offsets for V.
2625///
2626/// This strips all constant offsets off of V, leaving it the base pointer, and
2627/// accumulates the total constant offset applied in the returned constant. It
2628/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2629/// no constant offsets applied.
2630ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2631 if (!V->getType()->isPointerTy())
2632 return nullptr;
2633
2634 unsigned AS = V->getType()->getPointerAddressSpace();
2635 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2636 APInt Offset = APInt::getZero(IntPtrWidth);
2637
2638 // Even though we don't look through PHI nodes, we could be called on an
2639 // instruction in an unreachable block, which may be on a cycle.
2640 SmallPtrSet<Value *, 4> Visited;
2641 Visited.insert(V);
2642 do {
2643 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2644 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2645 return nullptr;
2646 V = GEP->getPointerOperand();
2647 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2648 if (GA->isInterposable())
2649 break;
2650 V = GA->getAliasee();
2651 } else {
2652 break;
2653 }
2654 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2655 } while (Visited.insert(V).second);
2656
2657 Type *IdxPtrTy = DL.getIndexType(V->getType());
2658 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2659}
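// Illustrative example (editorial, hypothetical IR): for a call argument
//   %arg = getelementptr inbounds [8 x i32], ptr %buf, i64 0, i64 2
// this routine rewrites V to %buf and returns the constant 8 (two i32
// elements), which analyze() then records in ConstantOffsetPtrs for the
// matching formal argument of the callee.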
2660
2661/// Find dead blocks due to deleted CFG edges during inlining.
2662///
2663/// If we know the successor of the current block, \p CurrBB, has to be \p
2664/// NextBB, the other successors of \p CurrBB are dead if these successors have
2665/// no live incoming CFG edges. If one block is found to be dead, we can
2666/// continue growing the dead block list by checking the successors of the dead
2667/// blocks to see if all their incoming edges are dead or not.
2668void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2669 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2670 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2671 // known successor which is not the one under exam.
2672 return (DeadBlocks.count(Pred) ||
2673 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2674 };
2675
2676 auto IsNewlyDead = [&](BasicBlock *BB) {
2677 // If all the edges to a block are dead, the block is also dead.
2678 return (!DeadBlocks.count(BB) &&
2679 llvm::all_of(predecessors(BB),
2680 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2681 };
2682
2683 for (BasicBlock *Succ : successors(CurrBB)) {
2684 if (Succ == NextBB || !IsNewlyDead(Succ))
2685 continue;
2686 SmallVector<BasicBlock *, 4> NewDead;
2687 NewDead.push_back(Succ);
2688 while (!NewDead.empty()) {
2689 BasicBlock *Dead = NewDead.pop_back_val();
2690 if (DeadBlocks.insert(Dead).second)
2691 // Continue growing the dead block lists.
2692 for (BasicBlock *S : successors(Dead))
2693 if (IsNewlyDead(S))
2694 NewDead.push_back(S);
2695 }
2696 }
2697}
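// Illustrative example (editorial): if a callee branch condition simplifies
// to a constant so that "br i1 %c, label %then, label %else" is known to take
// %then, analyze() sets KnownSuccessors[CurrBB] = %then and calls this with
// NextBB = %then; %else, and any blocks reachable only through it, are then
// marked dead so analyzeBlock() never charges their instructions.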
2698
2699/// Analyze a call site for potential inlining.
2700///
2701/// Returns true if inlining this call is viable, and false if it is not
2702/// viable. It computes the cost and adjusts the threshold based on numerous
2703/// factors and heuristics. If this method returns false but the computed cost
2704/// is below the computed threshold, then inlining was forcibly disabled by
2705/// some artifact of the routine.
2706InlineResult CallAnalyzer::analyze() {
2707 ++NumCallsAnalyzed;
2708
2709 auto Result = onAnalysisStart();
2710 if (!Result.isSuccess())
2711 return Result;
2712
2713 if (F.empty())
2714 return InlineResult::success();
2715
2716 Function *Caller = CandidateCall.getFunction();
2717 // Check if the caller function is recursive itself.
2718 for (User *U : Caller->users()) {
2719 CallBase *Call = dyn_cast<CallBase>(U);
2720 if (Call && Call->getFunction() == Caller) {
2721 IsCallerRecursive = true;
2722 break;
2723 }
2724 }
2725
2726 // Populate our simplified values by mapping from function arguments to call
2727 // arguments with known important simplifications.
2728 auto CAI = CandidateCall.arg_begin();
2729 for (Argument &FAI : F.args()) {
2730 assert(CAI != CandidateCall.arg_end());
2731 if (Constant *C = dyn_cast<Constant>(CAI))
2732 SimplifiedValues[&FAI] = C;
2733
2734 Value *PtrArg = *CAI;
2735 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2736 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2737
2738 // We can SROA any pointer arguments derived from alloca instructions.
2739 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2740 SROAArgValues[&FAI] = SROAArg;
2741 onInitializeSROAArg(SROAArg);
2742 EnabledSROAAllocas.insert(SROAArg);
2743 }
2744 }
2745 ++CAI;
2746 }
2747 NumConstantArgs = SimplifiedValues.size();
2748 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2749 NumAllocaArgs = SROAArgValues.size();
2750
2751 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2752 // the ephemeral values multiple times (and they're completely determined by
2753 // the callee, so this is purely duplicate work).
2754 SmallPtrSet<const Value *, 32> EphValues;
2755 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2756
2757 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2758 // adding basic blocks of the callee which can be proven to be dead for this
2759 // particular call site in order to get more accurate cost estimates. This
2760 // requires a somewhat heavyweight iteration pattern: we need to walk the
2761 // basic blocks in a breadth-first order as we insert live successors. To
2762 // accomplish this, and because we exit early once we cross our threshold, we
2763 // use a small-size optimized SetVector.
2764 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2765 BBSetVector BBWorklist;
2766 BBWorklist.insert(&F.getEntryBlock());
2767
2768 // Note that we *must not* cache the size, this loop grows the worklist.
2769 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2770 if (shouldStop())
2771 break;
2772
2773 BasicBlock *BB = BBWorklist[Idx];
2774 if (BB->empty())
2775 continue;
2776
2777 onBlockStart(BB);
2778
2779 // Disallow inlining a blockaddress with uses other than strictly callbr.
2780 // A blockaddress only has defined behavior for an indirect branch in the
2781 // same function, and we do not currently support inlining indirect
2782 // branches. But, the inliner may not see an indirect branch that ends up
2783 // being dead code at a particular call site. If the blockaddress escapes
2784 // the function, e.g., via a global variable, inlining may lead to an
2785 // invalid cross-function reference.
2786 // FIXME: pr/39560: continue relaxing this overt restriction.
2787 if (BB->hasAddressTaken())
2788 for (User *U : BlockAddress::get(&*BB)->users())
2789 if (!isa<CallBrInst>(*U))
2790 return InlineResult::failure("blockaddress used outside of callbr");
2791
2792 // Analyze the cost of this block. If we blow through the threshold, this
2793 // returns an unsuccessful result, and we can bail out.
2794 InlineResult IR = analyzeBlock(BB, EphValues);
2795 if (!IR.isSuccess())
2796 return IR;
2797
2798 Instruction *TI = BB->getTerminator();
2799
2800 // Add in the live successors by first checking whether we have terminator
2801 // that may be simplified based on the values simplified by this call.
2802 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2803 if (BI->isConditional()) {
2804 Value *Cond = BI->getCondition();
2805 if (ConstantInt *SimpleCond =
2806 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2807 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2808 BBWorklist.insert(NextBB);
2809 KnownSuccessors[BB] = NextBB;
2810 findDeadBlocks(BB, NextBB);
2811 continue;
2812 }
2813 }
2814 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2815 Value *Cond = SI->getCondition();
2816 if (ConstantInt *SimpleCond =
2817 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2818 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2819 BBWorklist.insert(NextBB);
2820 KnownSuccessors[BB] = NextBB;
2821 findDeadBlocks(BB, NextBB);
2822 continue;
2823 }
2824 }
2825
2826 // If we're unable to select a particular successor, just count all of
2827 // them.
2828 for (BasicBlock *Succ : successors(BB))
2829 BBWorklist.insert(Succ);
2830
2831 onBlockAnalyzed(BB);
2832 }
2833
2834 // If this is a noduplicate call, we can still inline as long as
2835 // inlining this would cause the removal of the caller (so the instruction
2836 // is not actually duplicated, just moved).
2837 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2838 return InlineResult::failure("noduplicate");
2839
2840 // If the callee's stack size exceeds the user-specified threshold,
2841 // do not let it be inlined.
2842 // The command line option overrides a limit set in the function attributes.
2843 size_t FinalStackSizeThreshold = StackSizeThreshold;
2844 if (!StackSizeThreshold.getNumOccurrences())
2845 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2846 Caller, InlineConstants::MaxInlineStackSizeAttributeName))
2847 FinalStackSizeThreshold = *AttrMaxStackSize;
2848 if (AllocatedSize > FinalStackSizeThreshold)
2849 return InlineResult::failure("stacksize");
2850
2851 return finalizeAnalysis();
2852}
2853
2854void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2855#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2856 if (PrintInstructionComments)
2857 F.print(OS, &Writer);
2858 DEBUG_PRINT_STAT(NumConstantArgs);
2859 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2860 DEBUG_PRINT_STAT(NumAllocaArgs);
2861 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2862 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2863 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2864 DEBUG_PRINT_STAT(NumInstructions);
2865 DEBUG_PRINT_STAT(SROACostSavings);
2866 DEBUG_PRINT_STAT(SROACostSavingsLost);
2867 DEBUG_PRINT_STAT(LoadEliminationCost);
2868 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2869 DEBUG_PRINT_STAT(Cost);
2870 DEBUG_PRINT_STAT(Threshold);
2871#undef DEBUG_PRINT_STAT
2872}
2873
2874#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2875/// Dump stats about this call's analysis.
2876LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2877#endif
2878
2879/// Test that there are no attribute conflicts between Caller and Callee
2880/// that prevent inlining.
2881 static bool functionsHaveCompatibleAttributes(
2882 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2883 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2884 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2885 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2886 // object, and always returns the same object (which is overwritten on each
2887 // GetTLI call). Therefore we copy the first result.
2888 auto CalleeTLI = GetTLI(*Callee);
2889 return (IgnoreTTIInlineCompatible ||
2890 TTI.areInlineCompatible(Caller, Callee)) &&
2891 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2892 InlineCallerSupersetNoBuiltin) &&
2893 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2894}
2895
2896 int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
2897 const DataLayout &DL) {
2898 int64_t Cost = 0;
2899 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2900 if (Call.isByValArgument(I)) {
2901 // We approximate the number of loads and stores needed by dividing the
2902 // size of the byval type by the target's pointer size.
2903 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2904 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2905 unsigned AS = PTy->getAddressSpace();
2906 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2907 // Ceiling division.
2908 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2909
2910 // If it generates more than 8 stores it is likely to be expanded as an
2911 // inline memcpy so we take that as an upper bound. Otherwise we assume
2912 // one load and one store per word copied.
2913 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2914 // here instead of a magic number of 8, but it's not available via
2915 // DataLayout.
2916 NumStores = std::min(NumStores, 8U);
2917
2918 Cost += 2 * NumStores * InstrCost;
2919 } else {
2920 // For non-byval arguments subtract off one instruction per call
2921 // argument.
2922 Cost += InstrCost;
2923 }
2924 }
2925 // The call instruction also disappears after inlining.
2926 Cost += InstrCost;
2927 Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
2928
2929 return std::min<int64_t>(Cost, INT_MAX);
2930}
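// Worked example (editorial): a byval argument of a 32-byte struct on a
// target with 64-bit pointers gives TypeSize = 256 and PointerSize = 64, so
// NumStores = 256 / 64 = 4 and the copy contributes 2 * 4 * InstrCost to the
// call-site cost computed here; the call itself adds one more InstrCost plus
// any target-specific call penalty. Each non-byval argument contributes a
// single InstrCost instead.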
2931
2932 InlineCost llvm::getInlineCost(
2933 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2934 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2935 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2936 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2937 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2938 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2939 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2940}
2941
2942 std::optional<int> llvm::getInliningCostEstimate(
2943 CallBase &Call, TargetTransformInfo &CalleeTTI,
2944 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2945 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2946 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2947 const InlineParams Params = {/* DefaultThreshold*/ 0,
2948 /*HintThreshold*/ {},
2949 /*ColdThreshold*/ {},
2950 /*OptSizeThreshold*/ {},
2951 /*OptMinSizeThreshold*/ {},
2952 /*HotCallSiteThreshold*/ {},
2953 /*LocallyHotCallSiteThreshold*/ {},
2954 /*ColdCallSiteThreshold*/ {},
2955 /*ComputeFullInlineCost*/ true,
2956 /*EnableDeferral*/ true};
2957
2958 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2959 GetAssumptionCache, GetBFI, PSI, ORE, true,
2960 /*IgnoreThreshold*/ true);
2961 auto R = CA.analyze();
2962 if (!R.isSuccess())
2963 return std::nullopt;
2964 return CA.getCost();
2965}
2966
2967std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2968 CallBase &Call, TargetTransformInfo &CalleeTTI,
2969 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2970 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2971 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2972 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2973 ORE, *Call.getCalledFunction(), Call);
2974 auto R = CFA.analyze();
2975 if (!R.isSuccess())
2976 return std::nullopt;
2977 return CFA.features();
2978}
2979
2980 std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2981 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2982 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2983
2984 // Cannot inline indirect calls.
2985 if (!Callee)
2986 return InlineResult::failure("indirect call");
2987
2988 // When a callee coroutine function is inlined into a caller coroutine
2989 // function before the coro-split pass, the coro-early pass cannot handle
2990 // this well. So we do not inline a coroutine function that has not yet
2991 // been split.
2992 if (Callee->isPresplitCoroutine())
2993 return InlineResult::failure("unsplited coroutine call");
2994
2995 // Never inline calls with byval arguments that do not have the alloca
2996 // address space. Since byval arguments can be replaced with a copy to an
2997 // alloca, the inlined code would need to be adjusted to handle that the
2998 // argument is in the alloca address space (so it is a little bit complicated
2999 // to solve).
3000 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3001 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3002 if (Call.isByValArgument(I)) {
3003 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
3004 if (PTy->getAddressSpace() != AllocaAS)
3005 return InlineResult::failure("byval arguments without alloca"
3006 " address space");
3007 }
3008
3009 // Calls to functions with always-inline attributes should be inlined
3010 // whenever possible.
3011 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3012 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3013 return InlineResult::failure("noinline call site attribute");
3014
3015 auto IsViable = isInlineViable(*Callee);
3016 if (IsViable.isSuccess())
3017 return InlineResult::success();
3018 return InlineResult::failure(IsViable.getFailureReason());
3019 }
3020
3021 // Never inline functions with conflicting attributes (unless callee has
3022 // always-inline attribute).
3023 Function *Caller = Call.getCaller();
3024 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3025 return InlineResult::failure("conflicting attributes");
3026
3027 // Don't inline this call if the caller has the optnone attribute.
3028 if (Caller->hasOptNone())
3029 return InlineResult::failure("optnone attribute");
3030
3031 // Don't inline a function that treats null pointer as valid into a caller
3032 // that does not have this attribute.
3033 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3034 return InlineResult::failure("nullptr definitions incompatible");
3035
3036 // Don't inline functions which can be interposed at link-time.
3037 if (Callee->isInterposable())
3038 return InlineResult::failure("interposable");
3039
3040 // Don't inline functions marked noinline.
3041 if (Callee->hasFnAttribute(Attribute::NoInline))
3042 return InlineResult::failure("noinline function attribute");
3043
3044 // Don't inline call sites marked noinline.
3045 if (Call.isNoInline())
3046 return InlineResult::failure("noinline call site attribute");
3047
3048 return std::nullopt;
3049}
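A typical driver consults this attribute-based check first and falls back to the full cost analysis only when it returns std::nullopt, exactly as getInlineCost does below. A minimal sketch (not part of this file; the helper name and abbreviated includes are assumptions):

#include "llvm/Analysis/InlineCost.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Sketch only: report what the attribute-only check decides for a call site.
static void reportAttributeDecision(
    CallBase &CB, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
  std::optional<InlineResult> Decision = getAttributeBasedInliningDecision(
      CB, CB.getCalledFunction(), CalleeTTI, GetTLI);
  if (!Decision) {
    errs() << "no forced decision; run the cost model\n";
    return;
  }
  if (Decision->isSuccess())
    errs() << "must inline (user directive)\n";
  else
    errs() << "must not inline: " << Decision->getFailureReason() << '\n';
}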
3050
3051InlineCost llvm::getInlineCost(
3052 CallBase &Call, Function *Callee, const InlineParams &Params,
3053 TargetTransformInfo &CalleeTTI,
3054 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3055 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3056 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3057 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3058
3059 auto UserDecision =
3060 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3061
3062 if (UserDecision) {
3063 if (UserDecision->isSuccess())
3064 return llvm::InlineCost::getAlways("always inline attribute");
3065 return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3066 }
3067
3068 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3069 << "... (caller:" << Call.getCaller()->getName()
3070 << ")\n");
3071
3072 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3073 GetAssumptionCache, GetBFI, PSI, ORE);
3074 InlineResult ShouldInline = CA.analyze();
3075
3076 LLVM_DEBUG(CA.dump());
3077
3078 // Always make cost benefit based decision explicit.
3079 // We use always/never here since the threshold is not meaningful,
3080 // as it is not what drives the cost-benefit analysis.
3081 if (CA.wasDecidedByCostBenefit()) {
3082 if (ShouldInline.isSuccess())
3083 return InlineCost::getAlways("benefit over cost",
3084 CA.getCostBenefitPair());
3085 else
3086 return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3087 }
3088
3089 if (CA.wasDecidedByCostThreshold())
3090 return InlineCost::get(CA.getCost(), CA.getThreshold(),
3091 CA.getStaticBonusApplied());
3092
3093 // No details on how the decision was made, simply return always or never.
3094 return ShouldInline.isSuccess()
3095 ? InlineCost::getAlways("empty function")
3096 : InlineCost::getNever(ShouldInline.getFailureReason());
3097}
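The InlineCost returned here takes one of three shapes: always, never, or a variable cost measured against a threshold. A hedged sketch of how a caller might interpret it (not part of the file; names and the abbreviated includes are assumptions):

#include "llvm/Analysis/InlineCost.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Sketch only: classify the result of getInlineCost for one call site.
static void reportInlineCost(
    CallBase &CB, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAC,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
  InlineCost IC = getInlineCost(CB, Params, CalleeTTI, GetAC, GetTLI);
  if (IC.isAlways())
    errs() << "always inline: " << IC.getReason() << '\n';
  else if (IC.isNever())
    errs() << "never inline: " << IC.getReason() << '\n';
  else
    errs() << "cost " << IC.getCost() << " vs. threshold " << IC.getThreshold()
           << (IC ? " -> inline\n" : " -> do not inline\n");
}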
3098
3099InlineResult llvm::isInlineViable(Function &F) {
3100 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3101 for (BasicBlock &BB : F) {
3102 // Disallow inlining of functions which contain indirect branches.
3103 if (isa<IndirectBrInst>(BB.getTerminator()))
3104 return InlineResult::failure("contains indirect branches");
3105
3106 // Disallow inlining of blockaddresses which are used by non-callbr
3107 // instructions.
3108 if (BB.hasAddressTaken())
3109 for (User *U : BlockAddress::get(&BB)->users())
3110 if (!isa<CallBrInst>(*U))
3111 return InlineResult::failure("blockaddress used outside of callbr");
3112
3113 for (auto &II : BB) {
3114 CallBase *Call = dyn_cast<CallBase>(&II);
3115 if (!Call)
3116 continue;
3117
3118 // Disallow recursive calls.
3119 Function *Callee = Call->getCalledFunction();
3120 if (&F == Callee)
3121 return InlineResult::failure("recursive call");
3122
3123 // Disallow calls which expose returns-twice to a function not previously
3124 // attributed as such.
3125 if (!ReturnsTwice && isa<CallInst>(Call) &&
3126 cast<CallInst>(Call)->canReturnTwice())
3127 return InlineResult::failure("exposes returns-twice attribute");
3128
3129 if (Callee)
3130 switch (Callee->getIntrinsicID()) {
3131 default:
3132 break;
3133 case llvm::Intrinsic::icall_branch_funnel:
3134 // Disallow inlining of @llvm.icall.branch.funnel because the current
3135 // backend can't separate call targets from call arguments.
3136 return InlineResult::failure(
3137 "disallowed inlining of @llvm.icall.branch.funnel");
3138 case llvm::Intrinsic::localescape:
3139 // Disallow inlining functions that call @llvm.localescape. Doing this
3140 // correctly would require major changes to the inliner.
3141 return InlineResult::failure(
3142 "disallowed inlining of @llvm.localescape");
3143 case llvm::Intrinsic::vastart:
3144 // Disallow inlining of functions that initialize VarArgs with
3145 // va_start.
3146 return InlineResult::failure(
3147 "contains VarArgs initialized with va_start");
3148 }
3149 }
3150 }
3151
3152 return InlineResult::success();
3153}
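isInlineViable() is the hard filter applied to alwaysinline callees in getAttributeBasedInliningDecision above; it can also be queried on its own. A minimal sketch (not from the file; the helper name and abbreviated includes are assumptions):

#include "llvm/Analysis/InlineCost.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Sketch only: explain why a callee can never be inlined, if that is the case.
static void checkViability(Function &Callee) {
  InlineResult Viable = isInlineViable(Callee);
  if (!Viable.isSuccess())
    errs() << Callee.getName() << " cannot be inlined: "
           << Viable.getFailureReason() << '\n';
}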
3154
3155// APIs to create InlineParams based on command line flags and/or other
3156// parameters.
3157
3158InlineParams llvm::getInlineParams(int Threshold) {
3159 InlineParams Params;
3160
3161 // This field is the threshold to use for a callee by default. This is
3162 // derived from one or more of:
3163 // * optimization or size-optimization levels,
3164 // * a value passed to createFunctionInliningPass function, or
3165 // * the -inline-threshold flag.
3166 // If the -inline-threshold flag is explicitly specified, that is used
3167 // irrespective of anything else.
3168 if (InlineThreshold.getNumOccurrences() > 0)
3169 Params.DefaultThreshold = InlineThreshold;
3170 else
3171 Params.DefaultThreshold = Threshold;
3172
3173 // Set the HintThreshold knob from the -inlinehint-threshold.
3174 Params.HintThreshold = HintThreshold;
3175
3176 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3177 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3178
3179 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3180 // populate LocallyHotCallSiteThreshold. Later, we populate
3181 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3182 // we know that optimization level is O3 (in the getInlineParams variant that
3183 // takes the opt and size levels).
3184 // FIXME: Remove this check (and make the assignment unconditional) after
3185 // addressing size regression issues at O2.
3186 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3187 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3188
3189 // Set the ColdCallSiteThreshold knob from the
3190 // -inline-cold-callsite-threshold.
3191 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3192
3193 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3194 // -inline-threshold command-line option is not explicitly given. If that
3195 // option is present, then its value applies even for callees with size and
3196 // minsize attributes.
3197 // If the -inline-threshold is not specified, set the ColdThreshold from the
3198 // -inlinecold-threshold even if it is not explicitly passed. If
3199 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3200 // explicitly specified to set the ColdThreshold knob
3201 if (InlineThreshold.getNumOccurrences() == 0) {
3202 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3203 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3204 Params.ColdThreshold = ColdThreshold;
3205 } else if (ColdThreshold.getNumOccurrences() > 0) {
3206 Params.ColdThreshold = ColdThreshold;
3207 }
3208 return Params;
3209}
3210
3211InlineParams llvm::getInlineParams() {
3212 return getInlineParams(DefaultThreshold);
3213}
3214
3215// Compute the default threshold for inlining based on the opt level and the
3216// size opt level.
3217static int computeThresholdFromOptLevels(unsigned OptLevel,
3218 unsigned SizeOptLevel) {
3219 if (OptLevel > 2)
3220 return InlineConstants::OptAggressiveThreshold;
3221 if (SizeOptLevel == 1) // -Os
3222 return InlineConstants::OptSizeThreshold;
3223 if (SizeOptLevel == 2) // -Oz
3224 return InlineConstants::OptMinSizeThreshold;
3225 return DefaultThreshold;
3226}
3227
3228InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3229 auto Params =
3230 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3231 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3232 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3233 // when it is specified explicitly.
3234 if (OptLevel > 2)
3235 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3236 return Params;
3237}
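For example, getInlineParams(3, 0) resolves to InlineConstants::OptAggressiveThreshold because the opt-level check fires first; getInlineParams(2, 1) and getInlineParams(2, 2) resolve to InlineConstants::OptSizeThreshold and InlineConstants::OptMinSizeThreshold respectively; and getInlineParams(2, 0) falls back to the DefaultThreshold flag (-inlinedefault-threshold).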
3238
3239PreservedAnalyses
3240InlineCostAnnotationPrinterPass::run(Function &F,
3241 FunctionAnalysisManager &FAM) {
3242 PrintInstructionComments = true;
3243 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3244 [&](Function &F) -> AssumptionCache & {
3245 return FAM.getResult<AssumptionAnalysis>(F);
3246 };
3247 Module *M = F.getParent();
3248 ProfileSummaryInfo PSI(*M);
3249 TargetTransformInfo TTI(M->getDataLayout());
3250 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3251 // In the current implementation, the type of InlineParams doesn't matter as
3252 // the pass serves only for verification of inliner's decisions.
3253 // We can add a flag which determines InlineParams for this run. Right now,
3254 // the default InlineParams are used.
3255 const InlineParams Params = llvm::getInlineParams();
3256 for (BasicBlock &BB : F) {
3257 for (Instruction &I : BB) {
3258 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
3259 Function *CalledFunction = CI->getCalledFunction();
3260 if (!CalledFunction || CalledFunction->isDeclaration())
3261 continue;
3262 OptimizationRemarkEmitter ORE(CalledFunction);
3263 InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
3264 GetAssumptionCache, nullptr, &PSI, &ORE);
3265 ICCA.analyze();
3266 OS << " Analyzing call of " << CalledFunction->getName()
3267 << "... (caller:" << CI->getCaller()->getName() << ")\n";
3268 ICCA.print(OS);
3269 OS << "\n";
3270 }
3271 }
3272 }
3273 return PreservedAnalyses::all();
3274}
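For reference, a rough sketch of driving this printer pass programmatically with the new pass manager; this harness is an assumption for illustration, not code from LLVM, and the same output is normally reached through opt's printing pipeline:

#include "llvm/Analysis/InlineCost.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Sketch only: run InlineCostAnnotationPrinterPass over a single function.
static void printInlineCosts(Function &F) {
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM); // provides AssumptionAnalysis, etc.
  FunctionPassManager FPM;
  FPM.addPass(InlineCostAnnotationPrinterPass(errs()));
  FPM.run(F, FAM);
}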