//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}
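// For example, a callsite carrying the string attribute
// "call-inline-cost"="42" (consumed in onCallBaseVisitStart below) yields
// std::optional<int>(42); a missing or non-integer attribute yields
// std::nullopt.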

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about inline cost of a
// particular instruction
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as an implementation detail of functions
/// such as llvm::getInlineCost. Note the function_ref constructor parameters -
/// the expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Getter for TargetLibraryInfo
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUndefined) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  SmallDenseMap<BasicBlock *, BasicBlock *, 16> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive calls.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

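  /// Return the SROA candidate alloca that \p V maps back to, or null if
  /// there is none or SROA has since been disabled for it.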
  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  /// Use a value in its given form directly if possible, otherwise try looking
  /// for it in SimplifiedValues.
  template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
    if (auto *Direct = dyn_cast<T>(V))
      return Direct;
    return dyn_cast_if_present<T>(SimplifiedValues.lookup(V));
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  bool isLoweredToCall(Function *F, CallBase &Call);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(
      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    auto It = SimplifiedValues.find(I);
    if (It != SimplifiedValues.end())
      return It->second;
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// When considering forming a binary search tree, we should find the number of
// nodes, which is the same as the number of comparisons when lowered. For a
// given number of clusters, n, we can define a recursive function, f(n), to
// find the number of nodes in the tree. The recursion is:
// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This will lead to a binary tree where the leaf should be either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number of comparisons from non-leaf nodes should be:
// 2^(log2(n) - 1) - 1
// = 2^log2(n) * 2^-1 - 1
// = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
// n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
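// For example, with NumCaseCluster = 8 the recurrence gives
// f(8) = 1 + f(4) + f(4), where f(4) = 1 + f(2) + f(2) = 5, so f(8) = 11,
// matching the closed form 8 * 3 / 2 - 1 = 11.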

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
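    // Saturate at INT_MIN/INT_MAX rather than wrapping, so repeated large
    // increments cannot overflow Cost back below Threshold.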
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      // Suppose a default branch includes one compare and one conditional
      // branch if it's reachable.
      if (!DefaultDestUndefined)
        addCost(2 * InstrCost);
      // Suppose a jump table requires one load and one jump instruction.
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      // We can reduce a set of instructions if the default branch is
      // undefined.
      addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }
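  // For example, with the default InstrCost of 5, the switch lowering above
  // charges a jump table of size 4 with a reachable default destination
  // 2 * 5 + (4 * 5 + 2 * 5) = 40 units, while three case clusters with a
  // reachable default cost 3 * 2 * 5 = 30.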
  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to decide.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
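    // Add EntryCount / 2 before dividing so the udiv below rounds to nearest
    // instead of truncating.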
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    if (Size <= InlineSizeAllowance)
      return true;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
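    //
    // For example, with HotCountThreshold = 10000, Size = 100, and the
    // default savings multiplier of 8, inlining is accepted only if
    // CycleSavings * 8 >= 10000 * 100, i.e. CycleSavings >= 125000.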
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so we will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
                     ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    auto It = InstructionCostDetailMap.find(I);
    if (It != InstructionCostDetailMap.end())
      return It->second;
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 2;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false, true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    if (JumpTableSize) {
      if (!DefaultDestUndefined)
        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                  SwitchDefaultDestCostMultiplier * InstrCost);
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                (NumCaseCluster - DefaultDestUndefined) *
                    CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
      ProfileSummaryInfo *PSI, Function &Callee, CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
                     PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining of the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero. It happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}
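// For example, an annotation produced above might read (values illustrative):
//   ; cost before = 25, cost after = 30, threshold before = 487,
//   threshold after = 487, cost delta = 5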

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC =
        getDirectOrSimplifiedValue<ConstantInt>(GTI.getOperand());
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
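    // For example, indexing field 1 of { i32, i64 } on a typical 64-bit
    // target adds that field's offset of 8 bytes from the struct layout.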
1454 if (StructType *STy = GTI.getStructTypeOrNull()) {
1455 unsigned ElementIdx = OpC->getZExtValue();
1456 const StructLayout *SL = DL.getStructLayout(STy);
1457 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1458 continue;
1459 }
1460
1461 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1462 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1463 }
1464 return true;
1465}
1466
1467/// Use TTI to check whether a GEP is free.
1468///
1469/// Respects any simplified values known during the analysis of this callsite.
1470bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1472 Operands.push_back(GEP.getOperand(0));
1473 for (const Use &Op : GEP.indices())
1474 if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1475 Operands.push_back(SimpleOp);
1476 else
1477 Operands.push_back(Op);
1481}
1482
1483bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1484 disableSROA(I.getOperand(0));
1485
1486 // Check whether inlining will turn a dynamic alloca into a static
1487 // alloca and handle that case.
1488 if (I.isArrayAllocation()) {
1489 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1490 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1491 // Sometimes a dynamic alloca could be converted into a static alloca
1492 // after this constant prop, and become a huge static alloca on an
1493 // unconditional CFG path. Avoid inlining if this is going to happen above
1494 // a threshold.
1495 // FIXME: If the threshold is removed or lowered too much, we could end up
1496 // being too pessimistic and prevent inlining non-problematic code. This
1497 // could result in unintended perf regressions. A better overall strategy
1498 // is needed to track stack usage during inlining.
1499 Type *Ty = I.getAllocatedType();
1500 AllocatedSize = SaturatingMultiplyAdd(
1501 AllocSize->getLimitedValue(),
1502 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1504 HasDynamicAlloca = true;
1505 return false;
1506 }
1507 }
1508
1509 // Accumulate the allocated size.
1510 if (I.isStaticAlloca()) {
1511 Type *Ty = I.getAllocatedType();
1512 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1513 AllocatedSize);
1514 }
1515
1516 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1517 // a variety of reasons, and so we would like to not inline them into
1518 // functions which don't currently have a dynamic alloca. This simply
1519 // disables inlining altogether in the presence of a dynamic alloca.
1520 if (!I.isStaticAlloca())
1521 HasDynamicAlloca = true;
1522
1523 return false;
1524}
1525
1526bool CallAnalyzer::visitPHI(PHINode &I) {
1527 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1528 // though we don't want to propagate it's bonuses. The idea is to disable
1529 // SROA if it *might* be used in an inappropriate manner.
1530
1531 // Phi nodes are always zero-cost.
1532 // FIXME: Pointer sizes may differ between different address spaces, so do we
1533 // need to use correct address space in the call to getPointerSizeInBits here?
1534 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1535 // see the ZeroOffset is used as a dummy value, so we can probably use any
1536 // bit width for the ZeroOffset?
1537 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1538 bool CheckSROA = I.getType()->isPointerTy();
1539
1540 // Track the constant or pointer with constant offset we've seen so far.
1541 Constant *FirstC = nullptr;
1542 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1543 Value *FirstV = nullptr;
1544
1545 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1546 BasicBlock *Pred = I.getIncomingBlock(i);
1547 // If the incoming block is dead, skip the incoming block.
1548 if (DeadBlocks.count(Pred))
1549 continue;
1550 // If the parent block of phi is not the known successor of the incoming
1551 // block, skip the incoming block.
1552 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1553 if (KnownSuccessor && KnownSuccessor != I.getParent())
1554 continue;
1555
1556 Value *V = I.getIncomingValue(i);
1557 // If the incoming value is this phi itself, skip the incoming value.
1558 if (&I == V)
1559 continue;
1560
1561 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1562
1563 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1564 if (!C && CheckSROA)
1565 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1566
1567 if (!C && !BaseAndOffset.first)
1568 // The incoming value is neither a constant nor a pointer with constant
1569 // offset, exit early.
1570 return true;
1571
1572 if (FirstC) {
1573 if (FirstC == C)
1574 // If we've seen a constant incoming value before and it is the same
1575 // constant we see this time, continue checking the next incoming value.
1576 continue;
1577 // Otherwise early exit because we either see a different constant or saw
1578 // a constant before but we have a pointer with constant offset this time.
1579 return true;
1580 }
1581
1582 if (FirstV) {
1583 // The same logic as above, but check pointer with constant offset here.
1584 if (FirstBaseAndOffset == BaseAndOffset)
1585 continue;
1586 return true;
1587 }
1588
1589 if (C) {
1590 // This is the 1st time we've seen a constant, record it.
1591 FirstC = C;
1592 continue;
1593 }
1594
1595 // The remaining case is that this is the 1st time we've seen a pointer with
1596 // constant offset, record it.
1597 FirstV = V;
1598 FirstBaseAndOffset = BaseAndOffset;
1599 }
1600
1601 // Check if we can map phi to a constant.
1602 if (FirstC) {
1603 SimplifiedValues[&I] = FirstC;
1604 return true;
1605 }
1606
1607 // Check if we can map phi to a pointer with constant offset.
1608 if (FirstBaseAndOffset.first) {
1609 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1610
1611 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1612 SROAArgValues[&I] = SROAArg;
1613 }
1614
1615 return true;
1616}
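// For illustration (hypothetical IR): in
//   %p = phi ptr [ %a, %bb1 ], [ %a, %bb2 ]
// where %a is an argument tracked in ConstantOffsetPtrs, every live incoming
// value maps to the same base-and-offset pair, so %p inherits that mapping
// and SROA of the underlying alloca can remain enabled.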
1617
1618/// Check whether we can fold GEPs of constant-offset call site argument pointers.
1619/// This requires target data and inbounds GEPs.
1620///
1621/// \return true if the specified GEP can be folded.
1622bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1623 // Check if we have a base + offset for the pointer.
1624 std::pair<Value *, APInt> BaseAndOffset =
1625 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1626 if (!BaseAndOffset.first)
1627 return false;
1628
1629 // Check if the offset of this GEP is constant, and if so accumulate it
1630 // into Offset.
1631 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1632 return false;
1633
1634 // Add the result as a new mapping to Base + Offset.
1635 ConstantOffsetPtrs[&I] = BaseAndOffset;
1636
1637 return true;
1638}
1639
1640bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1641 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1642
1643 // Lambda to check whether a GEP's indices are all constant.
1644 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1645 for (const Use &Op : GEP.indices())
1646 if (!getDirectOrSimplifiedValue<Constant>(Op))
1647 return false;
1648 return true;
1649 };
1650
1651 if (!DisableGEPConstOperand)
1652 if (simplifyInstruction(I))
1653 return true;
1654
1655 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1656 if (SROAArg)
1657 SROAArgValues[&I] = SROAArg;
1658
1659 // Constant GEPs are modeled as free.
1660 return true;
1661 }
1662
1663 // Variable GEPs will require math and will disable SROA.
1664 if (SROAArg)
1665 disableSROAForArg(SROAArg);
1666 return isGEPFree(I);
1667}
1668
1669/// Simplify \p I if its operands are constants and update SimplifiedValues.
1670bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1671 SmallVector<Constant *> COps;
1672 for (Value *Op : I.operands()) {
1673 Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
1674 if (!COp)
1675 return false;
1676 COps.push_back(COp);
1677 }
1678 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1679 if (!C)
1680 return false;
1681 SimplifiedValues[&I] = C;
1682 return true;
1683}
1684
1685/// Try to simplify a call to llvm.is.constant.
1686///
1687/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1688/// we expect calls of this specific intrinsic to be infrequent.
1689///
1690/// FIXME: Given that we know CB's parent (F) caller
1691/// (CandidateCall->getParent()->getParent()), we might be able to determine
1692/// whether inlining F into F's caller would change how the call to
1693/// llvm.is.constant would evaluate.
1694bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1695 Value *Arg = CB.getArgOperand(0);
1696 auto *C = getDirectOrSimplifiedValue<Constant>(Arg);
1697
1698 Type *RT = CB.getFunctionType()->getReturnType();
1699 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1700 return true;
1701}
1702
1703bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1704 // As per the langref, "The fourth argument to llvm.objectsize determines if
1705 // the value should be evaluated at runtime."
1706 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1707 return false;
1708
1709 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1710 /*MustSucceed=*/true);
1711 Constant *C = dyn_cast_or_null<Constant>(V);
1712 if (C)
1713 SimplifiedValues[&CB] = C;
1714 return C;
1715}
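// For illustration (hypothetical IR): with @g a 64-byte global,
//   %sz = call i64 @llvm.objectsize.i64.p0(ptr @g, i1 false, i1 true, i1 false)
// passes a false fourth (dynamic) argument, so lowerObjectSizeCall can fold
// the call to 'i64 64' and it is recorded as simplified.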
1716
1717bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1718 // Propagate constants through bitcasts.
1719 if (simplifyInstruction(I))
1720 return true;
1721
1722 // Track base/offsets through casts
1723 std::pair<Value *, APInt> BaseAndOffset =
1724 ConstantOffsetPtrs.lookup(I.getOperand(0));
1725 // Casts don't change the offset, just wrap it up.
1726 if (BaseAndOffset.first)
1727 ConstantOffsetPtrs[&I] = BaseAndOffset;
1728
1729 // Also look for SROA candidates here.
1730 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1731 SROAArgValues[&I] = SROAArg;
1732
1733 // Bitcasts are always zero cost.
1734 return true;
1735}
1736
1737bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1738 // Propagate constants through ptrtoint.
1739 if (simplifyInstruction(I))
1740 return true;
1741
1742 // Track base/offset pairs when converted to a plain integer provided the
1743 // integer is large enough to represent the pointer.
1744 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1745 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1746 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1747 std::pair<Value *, APInt> BaseAndOffset =
1748 ConstantOffsetPtrs.lookup(I.getOperand(0));
1749 if (BaseAndOffset.first)
1750 ConstantOffsetPtrs[&I] = BaseAndOffset;
1751 }
1752
1753 // This is really weird. Technically, ptrtoint will disable SROA. However,
1754 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1755 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1756 // would block SROA would also block SROA if applied directly to a pointer,
1757 // and so we can just add the integer in here. The only places where SROA is
1758 // preserved either cannot fire on an integer, or won't in-and-of themselves
1759 // disable SROA (ext) w/o some later use that we would see and disable.
1760 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1761 SROAArgValues[&I] = SROAArg;
1762
1763 return TargetTransformInfo::TCC_Free ==
1764 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1765}
1766
1767bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1768 // Propagate constants through inttoptr.
1769 if (simplifyInstruction(I))
1770 return true;
1771
1772 // Track base/offset pairs when round-tripped through a pointer without
1773 // modifications provided the integer is not too large.
1774 Value *Op = I.getOperand(0);
1775 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1776 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1777 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1778 if (BaseAndOffset.first)
1779 ConstantOffsetPtrs[&I] = BaseAndOffset;
1780 }
1781
1782 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1783 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1784 SROAArgValues[&I] = SROAArg;
1785
1786 return TargetTransformInfo::TCC_Free ==
1787 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1788}
1789
1790bool CallAnalyzer::visitCastInst(CastInst &I) {
1791 // Propagate constants through casts.
1792 if (simplifyInstruction(I))
1793 return true;
1794
1795 // Disable SROA in the face of arbitrary casts we don't explicitly list
1796 // elsewhere.
1797 disableSROA(I.getOperand(0));
1798
1799 // If this is a floating-point cast, and the target says this operation
1800 // is expensive, this may eventually become a library call. Treat the cost
1801 // as such.
1802 switch (I.getOpcode()) {
1803 case Instruction::FPTrunc:
1804 case Instruction::FPExt:
1805 case Instruction::UIToFP:
1806 case Instruction::SIToFP:
1807 case Instruction::FPToUI:
1808 case Instruction::FPToSI:
1809 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1810 onCallPenalty();
1811 break;
1812 default:
1813 break;
1814 }
1815
1816 return TargetTransformInfo::TCC_Free ==
1817 TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1818}
1819
1820bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1821 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1822}
1823
1824bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1825 // Does the *call site* have the NonNull attribute set on an argument? We
1826 // use the attribute on the call site to memoize any analysis done in the
1827 // caller. This will also trip if the callee function has a non-null
1828 // parameter attribute, but that's a less interesting case because hopefully
1829 // the callee would already have been simplified based on that.
1830 if (Argument *A = dyn_cast<Argument>(V))
1831 if (paramHasAttr(A, Attribute::NonNull))
1832 return true;
1833
1834 // Is this an alloca in the caller? This is distinct from the attribute case
1835 // above because attributes aren't updated within the inliner itself and we
1836 // always want to catch the alloca derived case.
1837 if (isAllocaDerivedArg(V))
1838 // We can actually predict the result of comparisons between an
1839 // alloca-derived value and null. Note that this fires regardless of
1840 // SROA firing.
1841 return true;
1842
1843 return false;
1844}
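// For illustration: at a hypothetical call site 'call void @f(ptr nonnull %p)',
// an 'icmp eq ptr %arg, null' inside @f can be folded to false by the
// comparison handling below, which may in turn fold away a guarding branch.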
1845
1846bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1847 // If the normal destination of the invoke or the parent block of the call
1848 // site is unreachable-terminated, there is little point in inlining this
1849 // unless there is literally zero cost.
1850 // FIXME: Note that it is possible that an unreachable-terminated block has a
1851 // hot entry. For example, in the scenario below, inlining hot_call_X() may
1852 // be beneficial:
1853 // main() {
1854 // hot_call_1();
1855 // ...
1856 // hot_call_N()
1857 // exit(0);
1858 // }
1859 // For now, we are not handling this corner case here as it is rare in real
1860 // code. In the future, we should elaborate this based on BPI and BFI in more
1861 // general threshold-adjusting heuristics in updateThreshold().
1862 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1863 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1864 return false;
1865 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1866 return false;
1867
1868 return true;
1869}
1870
1871bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1872 BlockFrequencyInfo *CallerBFI) {
1873 // If global profile summary is available, then callsite's coldness is
1874 // determined based on that.
1875 if (PSI && PSI->hasProfileSummary())
1876 return PSI->isColdCallSite(Call, CallerBFI);
1877
1878 // Otherwise we need BFI to be available.
1879 if (!CallerBFI)
1880 return false;
1881
1882 // Determine if the callsite is cold relative to caller's entry. We could
1883 // potentially cache the computation of scaled entry frequency, but the added
1884 // complexity is not worth it unless this scaling shows up high in the
1885 // profiles.
1886 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1887 auto CallSiteBB = Call.getParent();
1888 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1889 auto CallerEntryFreq =
1890 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1891 return CallSiteFreq < CallerEntryFreq * ColdProb;
1892}
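// For illustration: if ColdCallSiteRelFreq is 2, ColdProb is 2%, so a callsite
// whose block executes once for every 100 caller entries satisfies
// 1 < 100 * 2% = 2 and is classified as cold.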
1893
1894std::optional<int>
1895InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1896 BlockFrequencyInfo *CallerBFI) {
1897
1898 // If global profile summary is available, then callsite's hotness is
1899 // determined based on that.
1900 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1901 return Params.HotCallSiteThreshold;
1902
1903 // Otherwise we need BFI to be available and to have a locally hot callsite
1904 // threshold.
1905 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1906 return std::nullopt;
1907
1908 // Determine if the callsite is hot relative to caller's entry. We could
1909 // potentially cache the computation of scaled entry frequency, but the added
1910 // complexity is not worth it unless this scaling shows up high in the
1911 // profiles.
1912 const BasicBlock *CallSiteBB = Call.getParent();
1913 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1914 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1915 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1916 if (Limit && CallSiteFreq >= *Limit)
1917 return Params.LocallyHotCallSiteThreshold;
1918
1919 // Otherwise treat it normally.
1920 return std::nullopt;
1921}
1922
1923void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1924 // If no size growth is allowed for this inlining, set Threshold to 0.
1925 if (!allowSizeGrowth(Call)) {
1926 Threshold = 0;
1927 return;
1928 }
1929
1930 Function *Caller = Call.getCaller();
1931
1932 // return min(A, B) if B is valid.
1933 auto MinIfValid = [](int A, std::optional<int> B) {
1934 return B ? std::min(A, *B) : A;
1935 };
1936
1937 // return max(A, B) if B is valid.
1938 auto MaxIfValid = [](int A, std::optional<int> B) {
1939 return B ? std::max(A, *B) : A;
1940 };
1941
1942 // Various bonus percentages. These are multiplied by Threshold to get the
1943 // bonus values.
1944 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1945 // basic block at the given callsite context. This is speculatively applied
1946 // and withdrawn if more than one basic block is seen.
1947 //
1948 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1949 // of the last call to a static function as inlining such functions is
1950 // guaranteed to reduce code size.
1951 //
1952 // These bonus percentages may be set to 0 based on properties of the caller
1953 // and the callsite.
1954 int SingleBBBonusPercent = 50;
1955 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1956 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
1957
1958 // Lambda to set all the above bonus and bonus percentages to 0.
1959 auto DisallowAllBonuses = [&]() {
1960 SingleBBBonusPercent = 0;
1961 VectorBonusPercent = 0;
1962 LastCallToStaticBonus = 0;
1963 };
1964
1965 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1966 // and reduce the threshold if the caller has the necessary attribute.
1967 if (Caller->hasMinSize()) {
1968 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1969 // For minsize, we want to disable the single BB bonus and the vector
1970 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1971 // a static function will, at the minimum, eliminate the parameter setup and
1972 // call/return instructions.
1973 SingleBBBonusPercent = 0;
1974 VectorBonusPercent = 0;
1975 } else if (Caller->hasOptSize())
1976 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1977
1978 // Adjust the threshold based on inlinehint attribute and profile based
1979 // hotness information if the caller does not have MinSize attribute.
1980 if (!Caller->hasMinSize()) {
1981 if (Callee.hasFnAttribute(Attribute::InlineHint))
1982 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1983
1984 // FIXME: After switching to the new passmanager, simplify the logic below
1985 // by checking only the callsite hotness/coldness as we will reliably
1986 // have local profile information.
1987 //
1988 // Callsite hotness and coldness can be determined if sample profile is
1989 // used (which adds hotness metadata to calls) or if caller's
1990 // BlockFrequencyInfo is available.
1991 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1992 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1993 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1994 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1995 // FIXME: This should update the threshold only if it exceeds the
1996 // current threshold, but AutoFDO + ThinLTO currently relies on this
1997 // behavior to prevent inlining of hot callsites during ThinLTO
1998 // compile phase.
1999 Threshold = *HotCallSiteThreshold;
2000 } else if (isColdCallSite(Call, CallerBFI)) {
2001 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2002 // Do not apply bonuses for a cold callsite including the
2003 // LastCallToStatic bonus. While this bonus might result in code size
2004 // reduction, it can cause the size of a non-cold caller to increase
2005 // preventing it from being inlined.
2006 DisallowAllBonuses();
2007 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2008 } else if (PSI) {
2009 // Use callee's global profile information only if we have no way of
2010 // determining this via callsite information.
2011 if (PSI->isFunctionEntryHot(&Callee)) {
2012 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2013 // If callsite hotness can not be determined, we may still know
2014 // that the callee is hot and treat it as a weaker hint for threshold
2015 // increase.
2016 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2017 } else if (PSI->isFunctionEntryCold(&Callee)) {
2018 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2019 // Do not apply bonuses for a cold callee including the
2020 // LastCallToStatic bonus. While this bonus might result in code size
2021 // reduction, it can cause the size of a non-cold caller to increase
2022 // preventing it from being inlined.
2023 DisallowAllBonuses();
2024 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2025 }
2026 }
2027 }
2028
2029 Threshold += TTI.adjustInliningThreshold(&Call);
2030
2031 // Finally, take the target-specific inlining threshold multiplier into
2032 // account.
2033 Threshold *= TTI.getInliningThresholdMultiplier();
2034
2035 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2036 VectorBonus = Threshold * VectorBonusPercent / 100;
2037
2038 // If there is only one call of the function, and it has internal linkage,
2039 // the cost of inlining it drops dramatically. It may seem odd to update
2040 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2041 if (isSoleCallToLocalFunction(Call, F)) {
2042 Cost -= LastCallToStaticBonus;
2043 StaticBonusApplied = LastCallToStaticBonus;
2044 }
2045}
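// For illustration (hypothetical numbers): if Threshold starts at 225 and
// HintThreshold is 325, an inlinehint callee raises Threshold to 325; assuming
// a threshold multiplier of 1 and a vector bonus of 150%, this gives
// SingleBBBonus = 325 * 50 / 100 = 162 and VectorBonus = 325 * 150 / 100 = 487,
// both granted speculatively and withdrawn later if the callee does not
// qualify.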
2046
2047bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2048 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2049 // First try to handle simplified comparisons.
2050 if (simplifyInstruction(I))
2051 return true;
2052
2053 if (I.getOpcode() == Instruction::FCmp)
2054 return false;
2055
2056 // Otherwise look for a comparison between constant offset pointers with
2057 // a common base.
2058 Value *LHSBase, *RHSBase;
2059 APInt LHSOffset, RHSOffset;
2060 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2061 if (LHSBase) {
2062 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2063 if (RHSBase && LHSBase == RHSBase) {
2064 // We have common bases, fold the icmp to a constant based on the
2065 // offsets.
2066 SimplifiedValues[&I] = ConstantInt::getBool(
2067 I.getType(),
2068 ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
2069 ++NumConstantPtrCmps;
2070 return true;
2071 }
2072 }
2073
2074 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2075 for (auto *User : I.users())
2076 if (auto *Instr = dyn_cast<Instruction>(User))
2077 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2078 return false;
2079 return true;
2080 };
2081
2082 // If the comparison is an equality comparison with null, we can simplify it
2083 // if we know the value (argument) can't be null
2084 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2085 if (isKnownNonNullInCallee(I.getOperand(0))) {
2086 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2087 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2088 : ConstantInt::getFalse(I.getType());
2089 return true;
2090 }
2091 // Implicit null checks act as unconditional branches and their comparisons
2092 // should be treated as simplified and free of cost.
2093 if (isImplicitNullCheckCmp(I))
2094 return true;
2095 }
2096 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2097}
2098
2099bool CallAnalyzer::visitSub(BinaryOperator &I) {
2100 // Try to handle a special case: we can fold computing the difference of two
2101 // constant-related pointers.
2102 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2103 Value *LHSBase, *RHSBase;
2104 APInt LHSOffset, RHSOffset;
2105 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2106 if (LHSBase) {
2107 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2108 if (RHSBase && LHSBase == RHSBase) {
2109 // We have common bases, fold the subtract to a constant based on the
2110 // offsets.
2111 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2112 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2113 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2114 SimplifiedValues[&I] = C;
2115 ++NumConstantPtrDiffs;
2116 return true;
2117 }
2118 }
2119 }
2120
2121 // Otherwise, fall back to the generic logic for simplifying and handling
2122 // instructions.
2123 return Base::visitSub(I);
2124}
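// For illustration (hypothetical IR): if %pi and %qi are tracked integers of
// the same base with offsets 8 and 4 respectively, then
//   %d = sub i64 %pi, %qi
// folds to 'i64 4' here without falling back to Base::visitSub.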
2125
2126bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2127 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2128 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(LHS);
2129 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(RHS);
2130
2131 Value *SimpleV = nullptr;
2132 if (auto FI = dyn_cast<FPMathOperator>(&I))
2133 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2134 FI->getFastMathFlags(), DL);
2135 else
2136 SimpleV =
2137 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2138
2139 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2140 SimplifiedValues[&I] = C;
2141
2142 if (SimpleV)
2143 return true;
2144
2145 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2146 disableSROA(LHS);
2147 disableSROA(RHS);
2148
2149 // If the instruction is floating point, and the target says this operation
2150 // is expensive, this may eventually become a library call. Treat the cost
2151 // as such, unless it's fneg, which can be implemented with an xor.
2152 using namespace llvm::PatternMatch;
2153 if (I.getType()->isFloatingPointTy() &&
2154 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2155 !match(&I, m_FNeg(m_Value())))
2156 onCallPenalty();
2157
2158 return false;
2159}
2160
2161bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2162 Value *Op = I.getOperand(0);
2163 Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
2164
2165 Value *SimpleV = simplifyFNegInst(
2166 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2167
2168 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2169 SimplifiedValues[&I] = C;
2170
2171 if (SimpleV)
2172 return true;
2173
2174 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2175 disableSROA(Op);
2176
2177 return false;
2178}
2179
2180bool CallAnalyzer::visitLoad(LoadInst &I) {
2181 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2182 return true;
2183
2184 // If the data is already loaded from this address and hasn't been clobbered
2185 // by any stores or calls, this load is likely to be redundant and can be
2186 // eliminated.
2187 if (EnableLoadElimination &&
2188 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2189 onLoadEliminationOpportunity();
2190 return true;
2191 }
2192
2193 onMemAccess();
2194 return false;
2195}
2196
2197bool CallAnalyzer::visitStore(StoreInst &I) {
2198 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2199 return true;
2200
2201 // The store can potentially clobber loads and prevent repeated loads from
2202 // being eliminated.
2203 // FIXME:
2204 // 1. We can probably keep an initial set of eliminable loads subtracted
2205 // from the cost even when we finally see a store. We just need to disable
2206 // *further* accumulation of elimination savings.
2207 // 2. We should probably at some point thread MemorySSA for the callee into
2208 // this and then use that to actually compute *really* precise savings.
2209 disableLoadElimination();
2210
2211 onMemAccess();
2212 return false;
2213}
2214
2215bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2216 // Constant folding for extract value is trivial.
2217 if (simplifyInstruction(I))
2218 return true;
2219
2220 // SROA can't look through these, but they may be free.
2221 return Base::visitExtractValue(I);
2222}
2223
2224bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2225 // Constant folding for insert value is trivial.
2226 if (simplifyInstruction(I))
2227 return true;
2228
2229 // SROA can't look through these, but they may be free.
2230 return Base::visitInsertValue(I);
2231}
2232
2233/// Try to simplify a call site.
2234///
2235/// Takes a concrete function and callsite and tries to actually simplify it by
2236/// analyzing the arguments and call itself with instsimplify. Returns true if
2237/// it has simplified the callsite to some other entity (a constant), making it
2238/// free.
2239bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2240 // FIXME: Using the instsimplify logic directly for this is inefficient
2241 // because we have to continually rebuild the argument list even when no
2242 // simplifications can be performed. Until that is fixed with remapping
2243 // inside of instsimplify, directly constant fold calls here.
2244 if (!canConstantFoldCallTo(&Call, F))
2245 return false;
2246
2247 // Try to re-map the arguments to constants.
2248 SmallVector<Constant *, 4> ConstantArgs;
2249 ConstantArgs.reserve(Call.arg_size());
2250 for (Value *I : Call.args()) {
2251 Constant *C = getDirectOrSimplifiedValue<Constant>(I);
2252 if (!C)
2253 return false; // This argument doesn't map to a constant.
2254
2255 ConstantArgs.push_back(C);
2256 }
2257 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2258 SimplifiedValues[&Call] = C;
2259 return true;
2260 }
2261
2262 return false;
2263}
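// For illustration (hypothetical IR): once %x is known to simplify to the
// constant 4.0,
//   %r = call double @llvm.sqrt.f64(double %x)
// constant-folds to 2.0e+00 via ConstantFoldCall and the call becomes free.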
2264
2265bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2266 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2267 LibFunc LF;
2268 if (!TLI || !TLI->getLibFunc(*F, LF) || !TLI->has(LF))
2269 return TTI.isLoweredToCall(F);
2270
2271 switch (LF) {
2272 case LibFunc_memcpy_chk:
2273 case LibFunc_memmove_chk:
2274 case LibFunc_mempcpy_chk:
2275 case LibFunc_memset_chk: {
2276 // Calls to __memcpy_chk whose length is known to fit within the object
2277 // size will eventually be replaced by inline stores. Therefore, these
2278 // should not incur a call penalty. This is only really relevant on
2279 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2280 // other platforms use memcpy intrinsics, which are already exempt from the
2281 // call penalty.
2282 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(2));
2283 auto *ObjSizeOp =
2284 getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(3));
2285 if (LenOp && ObjSizeOp &&
2286 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2287 return false;
2288 }
2289 break;
2290 }
2291 default:
2292 break;
2293 }
2294
2295 return TTI.isLoweredToCall(F);
2296}
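// For illustration: on a platform whose headers lower memcpy to __memcpy_chk
// (hypothetical values), a call '__memcpy_chk(dst, src, 16, 32)' has a known
// length 16 <= object size 32, so it is expected to become inline stores and
// is not treated as a lowered call above.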
2297
2298bool CallAnalyzer::visitCallBase(CallBase &Call) {
2299 if (!onCallBaseVisitStart(Call))
2300 return true;
2301
2302 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2303 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2304 // This aborts the entire analysis.
2305 ExposesReturnsTwice = true;
2306 return false;
2307 }
2308 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2309 ContainsNoDuplicateCall = true;
2310
2311 Function *F = Call.getCalledFunction();
2312 bool IsIndirectCall = !F;
2313 if (IsIndirectCall) {
2314 // Check if this happens to be an indirect function call to a known function
2315 // in this inline context. If not, we've done all we can.
2316 Value *Callee = Call.getCalledOperand();
2317 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2318 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2319 onCallArgumentSetup(Call);
2320
2321 if (!Call.onlyReadsMemory())
2322 disableLoadElimination();
2323 return Base::visitCallBase(Call);
2324 }
2325 }
2326
2327 assert(F && "Expected a call to a known function");
2328
2329 // When we have a concrete function, first try to simplify it directly.
2330 if (simplifyCallSite(F, Call))
2331 return true;
2332
2333 // Next check if it is an intrinsic we know about.
2334 // FIXME: Lift this into part of the InstVisitor.
2335 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2336 switch (II->getIntrinsicID()) {
2337 default:
2338 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2339 disableLoadElimination();
2340 return Base::visitCallBase(Call);
2341
2342 case Intrinsic::load_relative:
2343 onLoadRelativeIntrinsic();
2344 return false;
2345
2346 case Intrinsic::memset:
2347 case Intrinsic::memcpy:
2348 case Intrinsic::memmove:
2349 disableLoadElimination();
2350 // SROA can usually chew through these intrinsics, but they aren't free.
2351 return false;
2352 case Intrinsic::icall_branch_funnel:
2353 case Intrinsic::localescape:
2354 HasUninlineableIntrinsic = true;
2355 return false;
2356 case Intrinsic::vastart:
2357 InitsVargArgs = true;
2358 return false;
2359 case Intrinsic::launder_invariant_group:
2360 case Intrinsic::strip_invariant_group:
2361 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2362 SROAArgValues[II] = SROAArg;
2363 return true;
2364 case Intrinsic::is_constant:
2365 return simplifyIntrinsicCallIsConstant(Call);
2366 case Intrinsic::objectsize:
2367 return simplifyIntrinsicCallObjectSize(Call);
2368 }
2369 }
2370
2371 if (F == Call.getFunction()) {
2372 // This flag will fully abort the analysis, so don't bother with anything
2373 // else.
2374 IsRecursiveCall = true;
2375 if (!AllowRecursiveCall)
2376 return false;
2377 }
2378
2379 if (isLoweredToCall(F, Call)) {
2380 onLoweredCall(F, Call, IsIndirectCall);
2381 }
2382
2383 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2384 disableLoadElimination();
2385 return Base::visitCallBase(Call);
2386}
2387
2388bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2389 // At least one return instruction will be free after inlining.
2390 bool Free = !HasReturn;
2391 HasReturn = true;
2392 return Free;
2393}
2394
2395bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2396 // We model unconditional branches as essentially free -- they really
2397 // shouldn't exist at all, but handling them makes the behavior of the
2398 // inliner more regular and predictable. Interestingly, conditional branches
2399 // which will fold away are also free.
2400 return BI.isUnconditional() ||
2401 getDirectOrSimplifiedValue<ConstantInt>(BI.getCondition()) ||
2402 BI.getMetadata(LLVMContext::MD_make_implicit);
2403}
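// For illustration (hypothetical IR): if %c was simplified to a constant,
//   br i1 %c, label %then, label %else
// is modeled as free here, and analyze() below only adds the taken successor
// to its worklist.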
2404
2405bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2406 bool CheckSROA = SI.getType()->isPointerTy();
2407 Value *TrueVal = SI.getTrueValue();
2408 Value *FalseVal = SI.getFalseValue();
2409
2410 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(TrueVal);
2411 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(FalseVal);
2412 Constant *CondC =
2413 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2414
2415 if (!CondC) {
2416 // Select C, X, X => X
2417 if (TrueC == FalseC && TrueC) {
2418 SimplifiedValues[&SI] = TrueC;
2419 return true;
2420 }
2421
2422 if (!CheckSROA)
2423 return Base::visitSelectInst(SI);
2424
2425 std::pair<Value *, APInt> TrueBaseAndOffset =
2426 ConstantOffsetPtrs.lookup(TrueVal);
2427 std::pair<Value *, APInt> FalseBaseAndOffset =
2428 ConstantOffsetPtrs.lookup(FalseVal);
2429 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2430 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2431
2432 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2433 SROAArgValues[&SI] = SROAArg;
2434 return true;
2435 }
2436
2437 return Base::visitSelectInst(SI);
2438 }
2439
2440 // Select condition is a constant.
2441 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2442 : (CondC->isNullValue()) ? FalseVal
2443 : nullptr;
2444 if (!SelectedV) {
2445 // Condition is a vector constant that is not all 1s or all 0s. If all
2446 // operands are constants, ConstantFoldSelectInstruction() can handle the
2447 // cases such as select vectors.
2448 if (TrueC && FalseC) {
2449 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2450 SimplifiedValues[&SI] = C;
2451 return true;
2452 }
2453 }
2454 return Base::visitSelectInst(SI);
2455 }
2456
2457 // Condition is either all 1s or all 0s. SI can be simplified.
2458 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2459 SimplifiedValues[&SI] = SelectedC;
2460 return true;
2461 }
2462
2463 if (!CheckSROA)
2464 return true;
2465
2466 std::pair<Value *, APInt> BaseAndOffset =
2467 ConstantOffsetPtrs.lookup(SelectedV);
2468 if (BaseAndOffset.first) {
2469 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2470
2471 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2472 SROAArgValues[&SI] = SROAArg;
2473 }
2474
2475 return true;
2476}
2477
2478bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2479 // We model unconditional switches as free, see the comments on handling
2480 // branches.
2481 if (getDirectOrSimplifiedValue<ConstantInt>(SI.getCondition()))
2482 return true;
2483
2484 // Assume the most general case where the switch is lowered into
2485 // either a jump table, bit test, or a balanced binary tree consisting of
2486 // case clusters without merging adjacent clusters with the same
2487 // destination. We do not consider the switches that are lowered with a mix
2488 // of jump table/bit test/binary search tree. The cost of the switch is
2489 // proportional to the size of the tree or the size of jump table range.
2490 //
2491 // NB: We convert large switches which are just used to initialize large phi
2492 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2493 // inlining those. It will prevent inlining in cases where the optimization
2494 // does not (yet) fire.
2495
2496 unsigned JumpTableSize = 0;
2497 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2498 unsigned NumCaseCluster =
2499 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2500
2501 onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
2502 return false;
2503}
2504
2505bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2506 // We never want to inline functions that contain an indirectbr. Inlining
2507 // one would be incorrect because all the blockaddresses (in static global
2508 // initializers, for example) would refer to the original function, and this
2509 // indirect jump would jump from the inlined copy of the function into the
2510 // original function which is extremely undefined behavior.
2511 // FIXME: This logic isn't really right; we can safely inline functions with
2512 // indirectbr's as long as no other function or global references the
2513 // blockaddress of a block within the current function.
2514 HasIndirectBr = true;
2515 return false;
2516}
2517
2518bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2519 // FIXME: It's not clear that a single instruction is an accurate model for
2520 // the inline cost of a resume instruction.
2521 return false;
2522}
2523
2524bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2525 // FIXME: It's not clear that a single instruction is an accurate model for
2526 // the inline cost of a cleanupret instruction.
2527 return false;
2528}
2529
2530bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2531 // FIXME: It's not clear that a single instruction is an accurate model for
2532 // the inline cost of a catchret instruction.
2533 return false;
2534}
2535
2536bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2537 // FIXME: It might be reasonable to discount the cost of instructions leading
2538 // to unreachable as they have the lowest possible impact on both runtime and
2539 // code size.
2540 return true; // No actual code is needed for unreachable.
2541}
2542
2543bool CallAnalyzer::visitInstruction(Instruction &I) {
2544 // Some instructions are free. All of the free intrinsics can also be
2545 // handled by SROA, etc.
2546 if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2547 TargetTransformInfo::TCC_Free)
2548 return true;
2549
2550 // We found something we don't understand or can't handle. Mark any SROA-able
2551 // values in the operand list as no longer viable.
2552 for (const Use &Op : I.operands())
2553 disableSROA(Op);
2554
2555 return false;
2556}
2557
2558/// Analyze a basic block for its contribution to the inline cost.
2559///
2560/// This method walks the analyzer over every instruction in the given basic
2561/// block and accounts for their cost during inlining at this callsite. It
2562/// aborts early if the threshold has been exceeded or an impossible to inline
2563/// construct has been detected. It returns false if inlining is no longer
2564/// viable, and true if inlining remains viable.
2565InlineResult
2566CallAnalyzer::analyzeBlock(BasicBlock *BB,
2567 SmallPtrSetImpl<const Value *> &EphValues) {
2568 for (Instruction &I : *BB) {
2569 // FIXME: Currently, the number of instructions in a function, regardless of
2570 // our ability to simplify them during inlining to constants or dead code,
2571 // is actually used by the vector bonus heuristic. As long as that's true,
2572 // we have to special case debug intrinsics here to prevent differences in
2573 // inlining due to debug symbols. Eventually, the number of unsimplified
2574 // instructions shouldn't factor into the cost computation, but until then,
2575 // hack around it here.
2576 // Similarly, skip pseudo-probes.
2577 if (I.isDebugOrPseudoInst())
2578 continue;
2579
2580 // Skip ephemeral values.
2581 if (EphValues.count(&I))
2582 continue;
2583
2584 ++NumInstructions;
2585 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2586 ++NumVectorInstructions;
2587
2588 // If the instruction simplified to a constant, there is no cost to this
2589 // instruction. Visit the instructions using our InstVisitor to account for
2590 // all of the per-instruction logic. The visit tree returns true if we
2591 // consumed the instruction in any way, and false if the instruction's base
2592 // cost should count against inlining.
2593 onInstructionAnalysisStart(&I);
2594
2595 if (Base::visit(&I))
2596 ++NumInstructionsSimplified;
2597 else
2598 onMissedSimplification();
2599
2600 onInstructionAnalysisFinish(&I);
2601 using namespace ore;
2602 // If visiting this instruction detected an uninlinable pattern, abort.
2603 InlineResult IR = InlineResult::success();
2604 if (IsRecursiveCall && !AllowRecursiveCall)
2605 IR = InlineResult::failure("recursive");
2606 else if (ExposesReturnsTwice)
2607 IR = InlineResult::failure("exposes returns twice");
2608 else if (HasDynamicAlloca)
2609 IR = InlineResult::failure("dynamic alloca");
2610 else if (HasIndirectBr)
2611 IR = InlineResult::failure("indirect branch");
2612 else if (HasUninlineableIntrinsic)
2613 IR = InlineResult::failure("uninlinable intrinsic");
2614 else if (InitsVargArgs)
2615 IR = InlineResult::failure("varargs");
2616 if (!IR.isSuccess()) {
2617 if (ORE)
2618 ORE->emit([&]() {
2619 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2620 &CandidateCall)
2621 << NV("Callee", &F) << " has uninlinable pattern ("
2622 << NV("InlineResult", IR.getFailureReason())
2623 << ") and cost is not fully computed";
2624 });
2625 return IR;
2626 }
2627
2628 // If the caller is a recursive function then we don't want to inline
2629 // functions which allocate a lot of stack space because it would increase
2630 // the caller stack usage dramatically.
2631 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2632 auto IR =
2633 InlineResult::failure("recursive and allocates too much stack space");
2634 if (ORE)
2635 ORE->emit([&]() {
2636 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2637 &CandidateCall)
2638 << NV("Callee", &F) << " is "
2639 << NV("InlineResult", IR.getFailureReason())
2640 << ". Cost is not fully computed";
2641 });
2642 return IR;
2643 }
2644
2645 if (shouldStop())
2646 return InlineResult::failure(
2647 "Call site analysis is not favorable to inlining.");
2648 }
2649
2650 return InlineResult::success();
2651}
2652
2653/// Compute the base pointer and cumulative constant offsets for V.
2654///
2655/// This strips all constant offsets off of V, leaving it the base pointer, and
2656/// accumulates the total constant offset applied in the returned constant. It
2657/// returns null if V is not a pointer, and returns the constant '0' if there are
2658/// no constant offsets applied.
2659ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2660 if (!V->getType()->isPointerTy())
2661 return nullptr;
2662
2663 unsigned AS = V->getType()->getPointerAddressSpace();
2664 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2665 APInt Offset = APInt::getZero(IntPtrWidth);
2666
2667 // Even though we don't look through PHI nodes, we could be called on an
2668 // instruction in an unreachable block, which may be on a cycle.
2669 SmallPtrSet<Value *, 4> Visited;
2670 Visited.insert(V);
2671 do {
2672 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2673 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2674 return nullptr;
2675 V = GEP->getPointerOperand();
2676 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2677 if (GA->isInterposable())
2678 break;
2679 V = GA->getAliasee();
2680 } else {
2681 break;
2682 }
2683 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2684 } while (Visited.insert(V).second);
2685
2686 Type *IdxPtrTy = DL.getIndexType(V->getType());
2687 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2688}
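// For illustration (hypothetical IR): passing
//   %p = getelementptr inbounds i32, ptr %base, i64 4
// as a call argument strips V down to %base and returns the byte offset 16
// (4 x i32), assuming a 64-bit index width.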
2689
2690/// Find dead blocks due to deleted CFG edges during inlining.
2691///
2692/// If we know the successor of the current block, \p CurrBB, has to be \p
2693/// NextBB, the other successors of \p CurrBB are dead if these successors have
2694/// no live incoming CFG edges. If one block is found to be dead, we can
2695/// continue growing the dead block list by checking the successors of the dead
2696/// blocks to see if all their incoming edges are dead or not.
2697void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2698 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2699 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2700 // known successor which is not the one under exam.
2701 return (DeadBlocks.count(Pred) ||
2702 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2703 };
2704
2705 auto IsNewlyDead = [&](BasicBlock *BB) {
2706 // If all the edges to a block are dead, the block is also dead.
2707 return (!DeadBlocks.count(BB) &&
2708 all_of(predecessors(BB),
2709 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2710 };
2711
2712 for (BasicBlock *Succ : successors(CurrBB)) {
2713 if (Succ == NextBB || !IsNewlyDead(Succ))
2714 continue;
2715 SmallVector<BasicBlock *, 4> NewDead;
2716 NewDead.push_back(Succ);
2717 while (!NewDead.empty()) {
2718 BasicBlock *Dead = NewDead.pop_back_val();
2719 if (DeadBlocks.insert(Dead).second)
2720 // Continue growing the dead block lists.
2721 for (BasicBlock *S : successors(Dead))
2722 if (IsNewlyDead(S))
2723 NewDead.push_back(S);
2724 }
2725 }
2726}
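// For illustration: if CurrBB ends in 'br i1 %c, ...' and %c simplified so
// that NextBB is the only live successor, the other successor is dead unless
// it has a different live predecessor, and its own successors reachable only
// through it are marked dead transitively.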
2727
2728/// Analyze a call site for potential inlining.
2729///
2730/// Returns true if inlining this call is viable, and false if it is not
2731/// viable. It computes the cost and adjusts the threshold based on numerous
2732/// factors and heuristics. If this method returns false but the computed cost
2733/// is below the computed threshold, then inlining was forcibly disabled by
2734/// some artifact of the routine.
2735InlineResult CallAnalyzer::analyze() {
2736 ++NumCallsAnalyzed;
2737
2738 auto Result = onAnalysisStart();
2739 if (!Result.isSuccess())
2740 return Result;
2741
2742 if (F.empty())
2743 return InlineResult::success();
2744
2745 Function *Caller = CandidateCall.getFunction();
2746 // Check if the caller function is recursive itself.
2747 for (User *U : Caller->users()) {
2748 CallBase *Call = dyn_cast<CallBase>(U);
2749 if (Call && Call->getFunction() == Caller) {
2750 IsCallerRecursive = true;
2751 break;
2752 }
2753 }
2754
2755 // Populate our simplified values by mapping from function arguments to call
2756 // arguments with known important simplifications.
2757 auto CAI = CandidateCall.arg_begin();
2758 for (Argument &FAI : F.args()) {
2759 assert(CAI != CandidateCall.arg_end());
2760 if (Constant *C = dyn_cast<Constant>(CAI))
2761 SimplifiedValues[&FAI] = C;
2762
2763 Value *PtrArg = *CAI;
2764 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2765 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2766
2767 // We can SROA any pointer arguments derived from alloca instructions.
2768 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2769 SROAArgValues[&FAI] = SROAArg;
2770 onInitializeSROAArg(SROAArg);
2771 EnabledSROAAllocas.insert(SROAArg);
2772 }
2773 }
2774 ++CAI;
2775 }
2776 NumConstantArgs = SimplifiedValues.size();
2777 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2778 NumAllocaArgs = SROAArgValues.size();
2779
2780 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2781 // the ephemeral values multiple times (and they're completely determined by
2782 // the callee, so this is purely duplicate work).
2783 SmallPtrSet<const Value *, 32> EphValues;
2784 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2785
2786 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2787 // adding basic blocks of the callee which can be proven to be dead for this
2788 // particular call site in order to get more accurate cost estimates. This
2789 // requires a somewhat heavyweight iteration pattern: we need to walk the
2790 // basic blocks in a breadth-first order as we insert live successors. To
2791 // accomplish this, we use a small-size optimized SetVector, prioritizing
2792 // small iteration counts because we exit after crossing our threshold.
2793 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2794 BBSetVector BBWorklist;
2795 BBWorklist.insert(&F.getEntryBlock());
2796
2797 // Note that we *must not* cache the size, this loop grows the worklist.
2798 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2799 if (shouldStop())
2800 break;
2801
2802 BasicBlock *BB = BBWorklist[Idx];
2803 if (BB->empty())
2804 continue;
2805
2806 onBlockStart(BB);
2807
2808 // Disallow inlining a blockaddress with uses other than strictly callbr.
2809 // A blockaddress only has defined behavior for an indirect branch in the
2810 // same function, and we do not currently support inlining indirect
2811 // branches. But, the inliner may not see an indirect branch that ends up
2812 // being dead code at a particular call site. If the blockaddress escapes
2813 // the function, e.g., via a global variable, inlining may lead to an
2814 // invalid cross-function reference.
2815 // FIXME: pr/39560: continue relaxing this overt restriction.
2816 if (BB->hasAddressTaken())
2817 for (User *U : BlockAddress::get(&*BB)->users())
2818 if (!isa<CallBrInst>(*U))
2819 return InlineResult::failure("blockaddress used outside of callbr");
2820
2821 // Analyze the cost of this block. If we blow through the threshold, this
2822 // returns false, and we can bail out.
2823 InlineResult IR = analyzeBlock(BB, EphValues);
2824 if (!IR.isSuccess())
2825 return IR;
2826
2827 Instruction *TI = BB->getTerminator();
2828
2829 // Add in the live successors by first checking whether we have terminator
2830 // that may be simplified based on the values simplified by this call.
2831 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2832 if (BI->isConditional()) {
2833 Value *Cond = BI->getCondition();
2834 if (ConstantInt *SimpleCond =
2835 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2836 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2837 BBWorklist.insert(NextBB);
2838 KnownSuccessors[BB] = NextBB;
2839 findDeadBlocks(BB, NextBB);
2840 continue;
2841 }
2842 }
2843 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2844 Value *Cond = SI->getCondition();
2845 if (ConstantInt *SimpleCond =
2846 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2847 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2848 BBWorklist.insert(NextBB);
2849 KnownSuccessors[BB] = NextBB;
2850 findDeadBlocks(BB, NextBB);
2851 continue;
2852 }
2853 }
2854
2855 // If we're unable to select a particular successor, just count all of
2856 // them.
2857 for (BasicBlock *Succ : successors(BB))
2858 BBWorklist.insert(Succ);
2859
2860 onBlockAnalyzed(BB);
2861 }
2862
2863 // If this is a noduplicate call, we can still inline as long as
2864 // inlining this would cause the removal of the callee (so the instruction
2865 // is not actually duplicated, just moved).
2866 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2867 return InlineResult::failure("noduplicate");
2868
2869 // If the callee's stack size exceeds the user-specified threshold,
2870 // do not let it be inlined.
2871 // The command line option overrides a limit set in the function attributes.
2872 size_t FinalStackSizeThreshold = StackSizeThreshold;
2873 if (!StackSizeThreshold.getNumOccurrences())
2874 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2875 CandidateCall, InlineConstants::MaxInlineStackSizeAttributeName))
2876 FinalStackSizeThreshold = *AttrMaxStackSize;
2877 if (AllocatedSize > FinalStackSizeThreshold)
2878 return InlineResult::failure("stacksize");
2879
2880 return finalizeAnalysis();
2881}
2882
2883void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2884#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2885 if (PrintInstructionComments)
2886 F.print(OS, &Writer);
2887 DEBUG_PRINT_STAT(NumConstantArgs);
2888 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2889 DEBUG_PRINT_STAT(NumAllocaArgs);
2890 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2891 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2892 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2893 DEBUG_PRINT_STAT(NumInstructions);
2894 DEBUG_PRINT_STAT(SROACostSavings);
2895 DEBUG_PRINT_STAT(SROACostSavingsLost);
2896 DEBUG_PRINT_STAT(LoadEliminationCost);
2897 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2898 DEBUG_PRINT_STAT(Cost);
2899 DEBUG_PRINT_STAT(Threshold);
2900#undef DEBUG_PRINT_STAT
2901}
2902
2903#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2904/// Dump stats about this call's analysis.
2905LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2906#endif
2907
2908/// Test that there are no attribute conflicts between Caller and Callee
2909/// that prevent inlining.
2910static bool functionsHaveCompatibleAttributes(
2911 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2912 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2913 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2914 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2915 // object, and always returns the same object (which is overwritten on each
2916 // GetTLI call). Therefore we copy the first result.
2917 auto CalleeTLI = GetTLI(*Callee);
2918 return (IgnoreTTIInlineCompatible ||
2919 TTI.areInlineCompatible(Caller, Callee)) &&
2920 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2921 InlineCallerSupersetNoBuiltin) &&
2922 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2923}
2924
2925int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
2926 const DataLayout &DL) {
2927 int64_t Cost = 0;
2928 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2929 if (Call.isByValArgument(I)) {
2930 // We approximate the number of loads and stores needed by dividing the
2931 // size of the byval type by the target's pointer size.
2932 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2933 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2934 unsigned AS = PTy->getAddressSpace();
2935 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2936 // Ceiling division.
2937 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2938
2939 // If it generates more than 8 stores it is likely to be expanded as an
2940 // inline memcpy so we take that as an upper bound. Otherwise we assume
2941 // one load and one store per word copied.
2942 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2943 // here instead of a magic number of 8, but it's not available via
2944 // DataLayout.
2945 NumStores = std::min(NumStores, 8U);
2946
2947 Cost += 2 * NumStores * InstrCost;
2948 } else {
2949 // For non-byval arguments subtract off one instruction per call
2950 // argument.
2951 Cost += InstrCost;
2952 }
2953 }
2954 // The call instruction also disappears after inlining.
2955 Cost += InstrCost;
2956 Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
2957
2958 return std::min<int64_t>(Cost, INT_MAX);
2959}
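// For illustration: a 64-byte byval argument with 64-bit pointers gives
// TypeSize = 512 and PointerSize = 64, so NumStores = 512 / 64 = 8 (exactly
// the cap) and the argument contributes 2 * 8 * InstrCost; every other
// argument contributes InstrCost, plus InstrCost for the call itself.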
2960
2961InlineCost llvm::getInlineCost(
2962 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2963 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2964 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2965 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2966 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2967 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2968 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2969}
2970
2971std::optional<int> llvm::getInliningCostEstimate(
2972 CallBase &Call, TargetTransformInfo &CalleeTTI,
2973 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2974 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2975 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2976 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2977 const InlineParams Params = {/* DefaultThreshold*/ 0,
2978 /*HintThreshold*/ {},
2979 /*ColdThreshold*/ {},
2980 /*OptSizeThreshold*/ {},
2981 /*OptMinSizeThreshold*/ {},
2982 /*HotCallSiteThreshold*/ {},
2983 /*LocallyHotCallSiteThreshold*/ {},
2984 /*ColdCallSiteThreshold*/ {},
2985 /*ComputeFullInlineCost*/ true,
2986 /*EnableDeferral*/ true};
2987
2988 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2989 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
2990 /*IgnoreThreshold*/ true);
2991 auto R = CA.analyze();
2992 if (!R.isSuccess())
2993 return std::nullopt;
2994 return CA.getCost();
2995}
2996
2997std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2998 CallBase &Call, TargetTransformInfo &CalleeTTI,
2999 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3000 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3001 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3002 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3003 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3004 PSI, ORE, *Call.getCalledFunction(), Call);
3005 auto R = CFA.analyze();
3006 if (!R.isSuccess())
3007 return std::nullopt;
3008 return CFA.features();
3009}
3010
3011std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3012 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3013 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3014
3015 // Cannot inline indirect calls.
3016 if (!Callee)
3017 return InlineResult::failure("indirect call");
3018
3019 // When a callee coroutine function is inlined into a caller coroutine
3020 // function before the coro-split pass, the coro-early pass cannot handle
3021 // this quite well. So we don't inline a coroutine function that has not
3022 // yet been split.
3023 if (Callee->isPresplitCoroutine())
3024 return InlineResult::failure("unsplit coroutine call");
3025
3026 // Never inline calls with byval arguments that do not have the alloca
3027 // address space. Since byval arguments can be replaced with a copy to an
3028 // alloca, the inlined code would need to be adjusted to handle the fact
3029 // that the argument is in the alloca address space (so it is a little bit
3030 // complicated to solve).
3031 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3032 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3033 if (Call.isByValArgument(I)) {
3034 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
3035 if (PTy->getAddressSpace() != AllocaAS)
3036 return InlineResult::failure("byval arguments without alloca"
3037 " address space");
3038 }
3039
3040 // Calls to functions with always-inline attributes should be inlined
3041 // whenever possible.
3042 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3043 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3044 return InlineResult::failure("noinline call site attribute");
3045
3046 auto IsViable = isInlineViable(*Callee);
3047 if (IsViable.isSuccess())
3048 return InlineResult::success();
3049 return InlineResult::failure(IsViable.getFailureReason());
3050 }
3051
3052 // Never inline functions with conflicting attributes (unless callee has
3053 // always-inline attribute).
3054 Function *Caller = Call.getCaller();
3055 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3056 return InlineResult::failure("conflicting attributes");
3057
3058 // Don't inline this call if the caller has the optnone attribute.
3059 if (Caller->hasOptNone())
3060 return InlineResult::failure("optnone attribute");
3061
3062 // Don't inline a function that treats null pointer as valid into a caller
3063 // that does not have this attribute.
3064 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3065 return InlineResult::failure("nullptr definitions incompatible");
3066
3067 // Don't inline functions which can be interposed at link-time.
3068 if (Callee->isInterposable())
3069 return InlineResult::failure("interposable");
3070
3071 // Don't inline functions marked noinline.
3072 if (Callee->hasFnAttribute(Attribute::NoInline))
3073 return InlineResult::failure("noinline function attribute");
3074
3075 // Don't inline call sites marked noinline.
3076 if (Call.isNoInline())
3077 return InlineResult::failure("noinline call site attribute");
3078
3079 return std::nullopt;
3080}
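// Interpretation sketch (hypothetical caller): the tri-state result maps to
// "must inline" / "must not inline" / "no verdict, ask the cost model":
//
//   if (auto Decision =
//           getAttributeBasedInliningDecision(CB, Callee, CalleeTTI, GetTLI)) {
//     if (Decision->isSuccess())
//       ; // attributes force inlining (always_inline and viable)
//     else
//       ; // attributes forbid it: see Decision->getFailureReason()
//   } else {
//     ; // std::nullopt: fall through to the cost-based getInlineCost() below
//   }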
3081
3082InlineCost llvm::getInlineCost(
3083 CallBase &Call, Function *Callee, const InlineParams &Params,
3084 TargetTransformInfo &CalleeTTI,
3085 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3086 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3087 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3088 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3089
3090 auto UserDecision =
3091 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3092
3093 if (UserDecision) {
3094 if (UserDecision->isSuccess())
3095 return llvm::InlineCost::getAlways("always inline attribute");
3096 return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3097 }
3098
3099 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3100 << "... (caller:" << Call.getCaller()->getName()
3101 << ")\n");
3102
3103 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3104 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
3105 InlineResult ShouldInline = CA.analyze();
3106
3107 LLVM_DEBUG(CA.dump());
3108
3109 // Always make a cost-benefit-based decision explicit.
3110 // We use always/never here since the threshold is not meaningful,
3111 // as it is not what drives the cost-benefit analysis.
3112 if (CA.wasDecidedByCostBenefit()) {
3113 if (ShouldInline.isSuccess())
3114 return InlineCost::getAlways("benefit over cost",
3115 CA.getCostBenefitPair());
3116 else
3117 return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3118 }
3119
3120 if (CA.wasDecidedByCostThreshold())
3121 return InlineCost::get(CA.getCost(), CA.getThreshold(),
3122 CA.getStaticBonusApplied());
3123
3124 // No details on how the decision was made; simply return always or never.
3125 return ShouldInline.isSuccess()
3126 ? InlineCost::getAlways("empty function")
3127 : InlineCost::getNever(ShouldInline.getFailureReason());
3128}
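// Consumption sketch (hypothetical caller): an InlineCost is either a hard
// always/never verdict or a variable cost/threshold pair:
//
//   InlineCost IC = getInlineCost(CB, Params, CalleeTTI, GetAC, GetTLI);
//   if (IC.isAlways())
//     ; // inline unconditionally
//   else if (IC.isNever())
//     ; // never inline
//   else if (IC.getCost() < IC.getThreshold())
//     ; // variable cost under threshold: profitable to inline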
3129
3130InlineResult llvm::isInlineViable(Function &F) {
3131 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3132 for (BasicBlock &BB : F) {
3133 // Disallow inlining of functions which contain indirect branches.
3134 if (isa<IndirectBrInst>(BB.getTerminator()))
3135 return InlineResult::failure("contains indirect branches");
3136
3137 // Disallow inlining of blockaddresses which are used by non-callbr
3138 // instructions.
3139 if (BB.hasAddressTaken())
3140 for (User *U : BlockAddress::get(&BB)->users())
3141 if (!isa<CallBrInst>(*U))
3142 return InlineResult::failure("blockaddress used outside of callbr");
3143
3144 for (auto &II : BB) {
3145 CallBase *Call = dyn_cast<CallBase>(&II);
3146 if (!Call)
3147 continue;
3148
3149 // Disallow recursive calls.
3150 Function *Callee = Call->getCalledFunction();
3151 if (&F == Callee)
3152 return InlineResult::failure("recursive call");
3153
3154 // Disallow calls which expose returns-twice to a function not previously
3155 // attributed as such.
3156 if (!ReturnsTwice && isa<CallInst>(Call) &&
3157 cast<CallInst>(Call)->canReturnTwice())
3158 return InlineResult::failure("exposes returns-twice attribute");
3159
3160 if (Callee)
3161 switch (Callee->getIntrinsicID()) {
3162 default:
3163 break;
3164 case llvm::Intrinsic::icall_branch_funnel:
3165 // Disallow inlining of @llvm.icall.branch.funnel because current
3166 // backend can't separate call targets from call arguments.
3167 return InlineResult::failure(
3168 "disallowed inlining of @llvm.icall.branch.funnel");
3169 case llvm::Intrinsic::localescape:
3170 // Disallow inlining functions that call @llvm.localescape. Doing this
3171 // correctly would require major changes to the inliner.
3172 return InlineResult::failure(
3173 "disallowed inlining of @llvm.localescape");
3174 case llvm::Intrinsic::vastart:
3175 // Disallow inlining of functions that initialize VarArgs with
3176 // va_start.
3177 return InlineResult::failure(
3178 "contains VarArgs initialized with va_start");
3179 }
3180 }
3181 }
3182
3183 return InlineResult::success();
3184}
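// Usage sketch (hypothetical): isInlineViable() is the same filter that
// guards the always-inline path above, so a client that forces inlining
// would typically check it first:
//
//   InlineResult Viable = isInlineViable(*Callee);
//   if (!Viable.isSuccess())
//     dbgs() << "cannot inline: " << Viable.getFailureReason() << "\n";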
3185
3186// APIs to create InlineParams based on command line flags and/or other
3187// parameters.
3188
3189InlineParams llvm::getInlineParams(int Threshold) {
3190 InlineParams Params;
3191
3192 // This field is the threshold to use for a callee by default. This is
3193 // derived from one or more of:
3194 // * optimization or size-optimization levels,
3195 // * a value passed to createFunctionInliningPass function, or
3196 // * the -inline-threshold flag.
3197 // If the -inline-threshold flag is explicitly specified, that is used
3198 // irrespective of anything else.
3199 if (InlineThreshold.getNumOccurrences() > 0)
3200 Params.DefaultThreshold = InlineThreshold;
3201 else
3202 Params.DefaultThreshold = Threshold;
3203
3204 // Set the HintThreshold knob from the -inlinehint-threshold.
3205 Params.HintThreshold = HintThreshold;
3206
3207 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3208 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3209
3210 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3211 // populate LocallyHotCallSiteThreshold. Later, we populate
3212 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3213 // we know that optimization level is O3 (in the getInlineParams variant that
3214 // takes the opt and size levels).
3215 // FIXME: Remove this check (and make the assignment unconditional) after
3216 // addressing size regression issues at O2.
3217 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3218 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3219
3220 // Set the ColdCallSiteThreshold knob from the
3221 // -inline-cold-callsite-threshold.
3222 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3223
3224 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3225 // -inline-threshold commandline option is not explicitly given. If that
3226 // option is present, then its value applies even for callees with size and
3227 // minsize attributes.
3228 // If the -inline-threshold is not specified, set the ColdThreshold from the
3229 // -inlinecold-threshold even if it is not explicitly passed. If
3230 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3231 // explicitly specified to set the ColdThreshold knob.
3232 if (InlineThreshold.getNumOccurrences() == 0) {
3233 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3234 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3235 Params.ColdThreshold = ColdThreshold;
3236 } else if (ColdThreshold.getNumOccurrences() > 0) {
3237 Params.ColdThreshold = ColdThreshold;
3238 }
3239 return Params;
3240}
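// Worked example (editorial, with the default flag values): after
// getInlineParams(100), Params.DefaultThreshold is 100 unless
// -inline-threshold was passed explicitly (the flag then wins),
// Params.HintThreshold is 325, and Params.ColdCallSiteThreshold is 45.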
3241
3242InlineParams llvm::getInlineParams() {
3243 return getInlineParams(DefaultThreshold);
3244}
3245
3246// Compute the default threshold for inlining based on the opt level and the
3247// size opt level.
3248static int computeThresholdFromOptLevels(unsigned OptLevel,
3249 unsigned SizeOptLevel) {
3250 if (OptLevel > 2)
3251 return InlineConstants::OptAggressiveThreshold;
3252 if (SizeOptLevel == 1) // -Os
3253 return InlineConstants::OptSizeThreshold;
3254 if (SizeOptLevel == 2) // -Oz
3255 return InlineConstants::OptMinSizeThreshold;
3256 return DefaultThreshold;
3257}
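// Worked examples (editorial; assuming the InlineConstants values
// OptAggressiveThreshold = 275, OptSizeThreshold = 50, and
// OptMinSizeThreshold = 5 from InlineCost.h):
//   computeThresholdFromOptLevels(3, 0) == 275  // -O3 wins over size levels
//   computeThresholdFromOptLevels(2, 1) == 50   // -Os
//   computeThresholdFromOptLevels(2, 2) == 5    // -Oz
//   computeThresholdFromOptLevels(2, 0) == 225  // -inlinedefault-threshold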
3258
3259InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3260 auto Params =
3261 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3262 // At O3, use the value of the -locally-hot-callsite-threshold option to
3263 // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has an
3264 // effect only when it is specified explicitly.
3265 if (OptLevel > 2)
3266 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3267 return Params;
3268}
3269
3270PreservedAnalyses
3271InlineCostAnnotationPrinterPass::run(Function &F,
3272 FunctionAnalysisManager &FAM) {
3273 PrintInstructionComments = true;
3274 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3275 [&](Function &F) -> AssumptionCache & {
3276 return FAM.getResult<AssumptionAnalysis>(F);
3277 };
3278 Module *M = F.getParent();
3279 ProfileSummaryInfo PSI(*M);
3280 TargetTransformInfo TTI(M->getDataLayout());
3281 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3282 // In the current implementation, the type of InlineParams doesn't matter as
3283 // the pass serves only for verification of the inliner's decisions.
3284 // We can add a flag which determines InlineParams for this run. Right now,
3285 // the default InlineParams are used.
3286 const InlineParams Params = llvm::getInlineParams();
3287 for (BasicBlock &BB : F) {
3288 for (Instruction &I : BB) {
3289 if (auto *CB = dyn_cast<CallBase>(&I)) {
3290 Function *CalledFunction = CB->getCalledFunction();
3291 if (!CalledFunction || CalledFunction->isDeclaration())
3292 continue;
3293 OptimizationRemarkEmitter ORE(CalledFunction);
3294 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3295 GetAssumptionCache, nullptr, nullptr, &PSI,
3296 &ORE);
3297 ICCA.analyze();
3298 OS << " Analyzing call of " << CalledFunction->getName()
3299 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3300 ICCA.print(OS);
3301 OS << "\n";
3302 }
3303 }
3304 }
3305 return PreservedAnalyses::all();
3306}
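// Invocation sketch: assuming this pass is registered as
// "print<inline-cost>" in PassRegistry.def, it can be run from the command
// line with:
//
//   opt -passes='print<inline-cost>' -disable-output input.ll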