LLVM 19.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringMap.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/ADT/Twine.h"
46#include "llvm/IR/BasicBlock.h"
47#include "llvm/IR/DebugLoc.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/InstrTypes.h"
52#include "llvm/IR/Instruction.h"
55#include "llvm/IR/LLVMContext.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/Module.h"
58#include "llvm/IR/PassManager.h"
60#include "llvm/IR/PseudoProbe.h"
67#include "llvm/Support/Debug.h"
71#include "llvm/Transforms/IPO.h"
82#include <algorithm>
83#include <cassert>
84#include <cstdint>
85#include <functional>
86#include <limits>
87#include <map>
88#include <memory>
89#include <queue>
90#include <string>
91#include <system_error>
92#include <utility>
93#include <vector>
94
95using namespace llvm;
96using namespace sampleprof;
97using namespace llvm::sampleprofutil;
99#define DEBUG_TYPE "sample-profile"
100#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
101
102STATISTIC(NumCSInlined,
103 "Number of functions inlined with context sensitive profile");
104STATISTIC(NumCSNotInlined,
105 "Number of functions not inlined with context sensitive profile");
106STATISTIC(NumMismatchedProfile,
107 "Number of functions with CFG mismatched profile");
108STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
109STATISTIC(NumDuplicatedInlinesite,
110 "Number of inlined callsites with a partial distribution factor");
111
112STATISTIC(NumCSInlinedHitMinLimit,
113 "Number of functions with FDO inline stopped due to min size limit");
114STATISTIC(NumCSInlinedHitMaxLimit,
115 "Number of functions with FDO inline stopped due to max size limit");
117 NumCSInlinedHitGrowthLimit,
118 "Number of functions with FDO inline stopped due to growth size limit");
119
120// Command line option to specify the file to read samples from. This is
121// mainly used for debugging.
123 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
124 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
125
126// The named file contains a set of transformations that may have been applied
127// to the symbol names between the program from which the sample data was
128// collected and the current program's symbols.
130 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
131 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
132
134 "salvage-stale-profile", cl::Hidden, cl::init(false),
135 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
136 "location for sample profile query."));
137
139 "report-profile-staleness", cl::Hidden, cl::init(false),
140 cl::desc("Compute and report stale profile statistical metrics."));
141
143 "persist-profile-staleness", cl::Hidden, cl::init(false),
144 cl::desc("Compute stale profile statistical metrics and write it into the "
145 "native object file(.llvm_stats section)."));
146
148 "profile-sample-accurate", cl::Hidden, cl::init(false),
149 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
150 "callsite and function as having 0 samples. Otherwise, treat "
151 "un-sampled callsites and functions conservatively as unknown. "));
152
154 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
155 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
156 "branches and calls as having 0 samples. Otherwise, treat "
157 "them conservatively as unknown. "));
158
160 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
161 cl::desc("For symbols in profile symbol list, regard their profiles to "
162 "be accurate. It may be overriden by profile-sample-accurate. "));
163
165 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
166 cl::desc("Merge past inlinee's profile to outline version if sample "
167 "profile loader decided not to inline a call site. It will "
168 "only be enabled when top-down order of profile loading is "
169 "enabled. "));
170
172 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
173 cl::desc("Do profile annotation and inlining for functions in top-down "
174 "order of call graph during sample profile loading. It only "
175 "works for new pass manager. "));
176
177static cl::opt<bool>
178 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
179 cl::desc("Process functions in a top-down order "
180 "defined by the profiled call graph when "
181 "-sample-profile-top-down-load is on."));
182
184 "sample-profile-inline-size", cl::Hidden, cl::init(false),
185 cl::desc("Inline cold call sites in profile loader if it's beneficial "
186 "for code size."));
187
188// Since profiles are consumed by many passes, turning on this option has
189// side effects. For instance, pre-link SCC inliner would see merged profiles
190// and inline the hot functions (that are skipped in this pass).
192 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
193 cl::desc("If true, artifically skip inline transformation in sample-loader "
194 "pass, and merge (or scale) profiles (as configured by "
195 "--sample-profile-merge-inlinee)."));
196
197namespace llvm {
199 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
200 cl::desc("Sort profiled recursion by edge weights."));
201
203 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
204 cl::desc("The size growth ratio limit for proirity-based sample profile "
205 "loader inlining."));
206
208 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
209 cl::desc("The lower bound of size growth limit for "
210 "proirity-based sample profile loader inlining."));
211
213 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
214 cl::desc("The upper bound of size growth limit for "
215 "proirity-based sample profile loader inlining."));
216
218 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
219 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
220 "inlining."));
221
223 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
224 cl::desc("Threshold for inlining cold callsites"));
225} // namespace llvm
226
228 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
229 cl::desc(
230 "Relative hotness percentage threshold for indirect "
231 "call promotion in proirity-based sample profile loader inlining."));
232
234 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
235 cl::desc(
236 "Skip relative hotness check for ICP up to given number of targets."));
237
239 "hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000),
240 cl::desc("A function is considered hot for staleness error check if its "
241 "total sample count is above the specified percentile"));
242
244 "min-functions-for-staleness-error", cl::Hidden, cl::init(50),
245 cl::desc("Skip the check if the number of hot functions is smaller than "
246 "the specified number."));
247
249 "precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80),
250 cl::desc("Reject the profile if the mismatch percent is higher than the "
251 "given number."));
252
254 "sample-profile-prioritized-inline", cl::Hidden,
255 cl::desc("Use call site prioritized inlining for sample profile loader."
256 "Currently only CSSPGO is supported."));
257
259 "sample-profile-use-preinliner", cl::Hidden,
260 cl::desc("Use the preinliner decisions stored in profile context."));
261
263 "sample-profile-recursive-inline", cl::Hidden,
264 cl::desc("Allow sample loader inliner to inline recursive calls."));
265
267 "sample-profile-remove-probe", cl::Hidden, cl::init(false),
268 cl::desc("Remove pseudo-probe after sample profile annotation."));
269
271 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
272 cl::desc(
273 "Optimization remarks file containing inline remarks to be replayed "
274 "by inlining from sample profile loader."),
275 cl::Hidden);
276
278 "sample-profile-inline-replay-scope",
279 cl::init(ReplayInlinerSettings::Scope::Function),
280 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
281 "Replay on functions that have remarks associated "
282 "with them (default)"),
283 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
284 "Replay on the entire module")),
285 cl::desc("Whether inline replay should be applied to the entire "
286 "Module or just the Functions (default) that are present as "
287 "callers in remarks during sample profile inlining."),
288 cl::Hidden);
289
291 "sample-profile-inline-replay-fallback",
292 cl::init(ReplayInlinerSettings::Fallback::Original),
295 ReplayInlinerSettings::Fallback::Original, "Original",
296 "All decisions not in replay send to original advisor (default)"),
297 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
298 "AlwaysInline", "All decisions not in replay are inlined"),
299 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
300 "All decisions not in replay are not inlined")),
301 cl::desc("How sample profile inline replay treats sites that don't come "
302 "from the replay. Original: defers to original advisor, "
303 "AlwaysInline: inline all sites not in replay, NeverInline: "
304 "inline no sites not in replay"),
305 cl::Hidden);
306
308 "sample-profile-inline-replay-format",
309 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
311 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
312 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
313 "<Line Number>:<Column Number>"),
314 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
315 "LineDiscriminator", "<Line Number>.<Discriminator>"),
316 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
317 "LineColumnDiscriminator",
318 "<Line Number>:<Column Number>.<Discriminator> (default)")),
319 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
320
322 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
323 cl::desc("Max number of promotions for a single indirect "
324 "call callsite in sample profile loader"));
325
327 "overwrite-existing-weights", cl::Hidden, cl::init(false),
328 cl::desc("Ignore existing branch weights on IR and always overwrite."));
329
331 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
332 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
333 "sample-profile inline pass name."));
334
335namespace llvm {
337}
338
339namespace {
340
341using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
342using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
343using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
344using EdgeWeightMap = DenseMap<Edge, uint64_t>;
345using BlockEdgeMap =
347
348class GUIDToFuncNameMapper {
349public:
350 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
351 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
352 : CurrentReader(Reader), CurrentModule(M),
353 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
354 if (!CurrentReader.useMD5())
355 return;
356
357 for (const auto &F : CurrentModule) {
358 StringRef OrigName = F.getName();
359 CurrentGUIDToFuncNameMap.insert(
360 {Function::getGUID(OrigName), OrigName});
361
362 // Local to global var promotion used by optimization like thinlto
363 // will rename the var and add suffix like ".llvm.xxx" to the
364 // original local name. In sample profile, the suffixes of function
365 // names are all stripped. Since it is possible that the mapper is
366 // built in post-thin-link phase and var promotion has been done,
367 // we need to add the substring of function name without the suffix
368 // into the GUIDToFuncNameMap.
370 if (CanonName != OrigName)
371 CurrentGUIDToFuncNameMap.insert(
372 {Function::getGUID(CanonName), CanonName});
373 }
374
375 // Update GUIDToFuncNameMap for each function including inlinees.
376 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
377 }
378
379 ~GUIDToFuncNameMapper() {
380 if (!CurrentReader.useMD5())
381 return;
382
383 CurrentGUIDToFuncNameMap.clear();
384
385 // Reset GUIDToFuncNameMap for each function as they're no
386 // longer valid at this point.
387 SetGUIDToFuncNameMapForAll(nullptr);
388 }
389
390private:
391 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
392 std::queue<FunctionSamples *> FSToUpdate;
393 for (auto &IFS : CurrentReader.getProfiles()) {
394 FSToUpdate.push(&IFS.second);
395 }
396
397 while (!FSToUpdate.empty()) {
398 FunctionSamples *FS = FSToUpdate.front();
399 FSToUpdate.pop();
400 FS->GUIDToFuncNameMap = Map;
401 for (const auto &ICS : FS->getCallsiteSamples()) {
402 const FunctionSamplesMap &FSMap = ICS.second;
403 for (const auto &IFS : FSMap) {
404 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
405 FSToUpdate.push(&FS);
406 }
407 }
408 }
409 }
410
412 Module &CurrentModule;
413 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
414};
415
416// Inline candidate used by iterative callsite prioritized inliner
417struct InlineCandidate {
418 CallBase *CallInstr;
419 const FunctionSamples *CalleeSamples;
420 // Prorated callsite count, which will be used to guide inlining. For example,
421 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
422 // copies will get their own distribution factors and their prorated counts
423 // will be used to decide if they should be inlined independently.
424 uint64_t CallsiteCount;
425 // Call site distribution factor to prorate the profile samples for a
426 // duplicated callsite. Default value is 1.0.
427 float CallsiteDistribution;
428};
429
430// Inline candidate comparer using call site weight
431struct CandidateComparer {
432 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
433 if (LHS.CallsiteCount != RHS.CallsiteCount)
434 return LHS.CallsiteCount < RHS.CallsiteCount;
435
436 const FunctionSamples *LCS = LHS.CalleeSamples;
437 const FunctionSamples *RCS = RHS.CalleeSamples;
438 assert(LCS && RCS && "Expect non-null FunctionSamples");
439
440 // Tie breaker using number of samples try to favor smaller functions first
441 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
442 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
443
444 // Tie breaker using GUID so we have stable/deterministic inlining order
445 return LCS->getGUID() < RCS->getGUID();
446 }
447};
448
449using CandidateQueue =
451 CandidateComparer>;
452
453/// Sample profile pass.
454///
455/// This pass reads profile data from the file specified by
456/// -sample-profile-file and annotates every affected function with the
457/// profile information found in that file.
458class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {
459public:
460 SampleProfileLoader(
461 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
463 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
464 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
465 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
467 std::move(FS)),
468 GetAC(std::move(GetAssumptionCache)),
469 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
470 LTOPhase(LTOPhase),
471 AnnotatedPassName(AnnotateSampleProfileInlinePhase
474 : CSINLINE_DEBUG) {}
475
476 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
477 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
479
480protected:
482 bool emitAnnotations(Function &F);
484 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
485 const FunctionSamples *
486 findFunctionSamples(const Instruction &I) const override;
487 std::vector<const FunctionSamples *>
488 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
489 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
490 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
491 uint64_t Threshold);
492 // Attempt to promote indirect call and also inline the promoted call
493 bool tryPromoteAndInlineCandidate(
494 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
495 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
496
497 bool inlineHotFunctions(Function &F,
498 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
499 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
500 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
501 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
502 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
503 bool
504 tryInlineCandidate(InlineCandidate &Candidate,
505 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
506 bool
507 inlineHotFunctionsWithPriority(Function &F,
508 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
509 // Inline cold/small functions in addition to hot ones
510 bool shouldInlineColdCallee(CallBase &CallInst);
511 void emitOptimizationRemarksForInlineCandidates(
512 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
513 bool Hot);
514 void promoteMergeNotInlinedContextSamples(
516 const Function &F);
517 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
518 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
519 void generateMDProfMetadata(Function &F);
520 bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
521 const SampleProfileMap &Profiles);
522 void removePseudoProbeInsts(Module &M);
523
524 /// Map from function name to Function *. Used to find the function from
525 /// the function name. If the function name contains suffix, additional
526 /// entry is added to map from the stripped name to the function if there
527 /// is one-to-one mapping.
529
530 std::function<AssumptionCache &(Function &)> GetAC;
531 std::function<TargetTransformInfo &(Function &)> GetTTI;
532 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
533
534 /// Profile tracker for different context.
535 std::unique_ptr<SampleContextTracker> ContextTracker;
536
537 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
538 ///
539 /// We need to know the LTO phase because for example in ThinLTOPrelink
540 /// phase, in annotation, we should not promote indirect calls. Instead,
541 /// we will mark GUIDs that needs to be annotated to the function.
542 const ThinOrFullLTOPhase LTOPhase;
543 const std::string AnnotatedPassName;
544
545 /// Profile symbol list tells whether a function name appears in the binary
546 /// used to generate the current profile.
547 std::unique_ptr<ProfileSymbolList> PSL;
548
549 /// Total number of samples collected in this profile.
550 ///
551 /// This is the sum of all the samples collected in all the functions executed
552 /// at runtime.
553 uint64_t TotalCollectedSamples = 0;
554
555 // Information recorded when we declined to inline a call site
556 // because we have determined it is too cold is accumulated for
557 // each callee function. Initially this is just the entry count.
558 struct NotInlinedProfileInfo {
559 uint64_t entryCount;
560 };
562
563 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
564 // all the function symbols defined or declared in current module.
565 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
566
567 // All the Names used in FunctionSamples including outline function
568 // names, inline instance names and call target names.
569 StringSet<> NamesInProfile;
570 // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is
571 // populated, depends on whether the profile uses MD5. Because the name table
572 // generally contains several magnitude more entries than the number of
573 // functions, we do not want to convert all names from one form to another.
574 llvm::DenseSet<uint64_t> GUIDsInProfile;
575
576 // For symbol in profile symbol list, whether to regard their profiles
577 // to be accurate. It is mainly decided by existence of profile symbol
578 // list and -profile-accurate-for-symsinlist flag, but it can be
579 // overridden by -profile-sample-accurate or profile-sample-accurate
580 // attribute.
581 bool ProfAccForSymsInList;
582
583 // External inline advisor used to replay inline decision from remarks.
584 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
585
586 // A helper to implement the sample profile matching algorithm.
587 std::unique_ptr<SampleProfileMatcher> MatchingManager;
588
589private:
590 const char *getAnnotatedRemarkPassName() const {
591 return AnnotatedPassName.c_str();
592 }
593};
594} // end anonymous namespace
595
596namespace llvm {
597template <>
599 return succ_empty(BB);
600}
601
602template <>
604 const std::vector<const BasicBlockT *> &BasicBlocks,
605 BlockEdgeMap &Successors, FlowFunction &Func) {
606 for (auto &Jump : Func.Jumps) {
607 const auto *BB = BasicBlocks[Jump.Source];
608 const auto *Succ = BasicBlocks[Jump.Target];
609 const Instruction *TI = BB->getTerminator();
610 // Check if a block ends with InvokeInst and mark non-taken branch unlikely.
611 // In that case block Succ should be a landing pad
612 if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
613 if (isa<InvokeInst>(TI)) {
614 Jump.IsUnlikely = true;
615 }
616 }
617 const Instruction *SuccTI = Succ->getTerminator();
618 // Check if the target block contains UnreachableInst and mark it unlikely
619 if (SuccTI->getNumSuccessors() == 0) {
620 if (isa<UnreachableInst>(SuccTI)) {
621 Jump.IsUnlikely = true;
622 }
623 }
624 }
625}
626
627template <>
629 Function &F) {
630 DT.reset(new DominatorTree);
631 DT->recalculate(F);
632
633 PDT.reset(new PostDominatorTree(F));
634
635 LI.reset(new LoopInfo);
636 LI->analyze(*DT);
637}
638} // namespace llvm
639
640ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
642 return getProbeWeight(Inst);
643
644 const DebugLoc &DLoc = Inst.getDebugLoc();
645 if (!DLoc)
646 return std::error_code();
647
648 // Ignore all intrinsics, phinodes and branch instructions.
649 // Branch and phinode instructions usually contain debug info from sources
650 // outside of the residing basic block, thus we ignore them during annotation.
651 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
652 return std::error_code();
653
654 // For non-CS profile, if a direct call/invoke instruction is inlined in
655 // profile (findCalleeFunctionSamples returns non-empty result), but not
656 // inlined here, it means that the inlined callsite has no sample, thus the
657 // call instruction should have 0 count.
658 // For CS profile, the callsite count of previously inlined callees is
659 // populated with the entry count of the callees.
661 if (const auto *CB = dyn_cast<CallBase>(&Inst))
662 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
663 return 0;
664
665 return getInstWeightImpl(Inst);
666}
667
668/// Get the FunctionSamples for a call instruction.
669///
670/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
671/// instance that the call instruction is calling into. It contains
672/// all samples that reside in the inlined instance. We first find the
673/// inlined instance in which the call instruction is from, then we
674/// traverse its children to find the callsite with the matching
675/// location.
676///
677/// \param Inst Call/Invoke instruction to query.
678///
679/// \returns The FunctionSamples pointer to the inlined instance.
680const FunctionSamples *
681SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
682 const DILocation *DIL = Inst.getDebugLoc();
683 if (!DIL) {
684 return nullptr;
685 }
686
687 StringRef CalleeName;
688 if (Function *Callee = Inst.getCalledFunction())
689 CalleeName = Callee->getName();
690
692 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
693
694 const FunctionSamples *FS = findFunctionSamples(Inst);
695 if (FS == nullptr)
696 return nullptr;
697
698 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
699 CalleeName, Reader->getRemapper());
700}
701
702/// Returns a vector of FunctionSamples that are the indirect call targets
703/// of \p Inst. The vector is sorted by the total number of samples. Stores
704/// the total call count of the indirect call in \p Sum.
705std::vector<const FunctionSamples *>
706SampleProfileLoader::findIndirectCallFunctionSamples(
707 const Instruction &Inst, uint64_t &Sum) const {
708 const DILocation *DIL = Inst.getDebugLoc();
709 std::vector<const FunctionSamples *> R;
710
711 if (!DIL) {
712 return R;
713 }
714
715 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
716 assert(L && R && "Expect non-null FunctionSamples");
717 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
718 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
719 return L->getGUID() < R->getGUID();
720 };
721
723 auto CalleeSamples =
724 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
725 if (CalleeSamples.empty())
726 return R;
727
728 // For CSSPGO, we only use target context profile's entry count
729 // as that already includes both inlined callees and non-inlined ones.
730 Sum = 0;
731 for (const auto *const FS : CalleeSamples) {
732 Sum += FS->getHeadSamplesEstimate();
733 R.push_back(FS);
734 }
735 llvm::sort(R, FSCompare);
736 return R;
737 }
738
739 const FunctionSamples *FS = findFunctionSamples(Inst);
740 if (FS == nullptr)
741 return R;
742
744 Sum = 0;
745 if (auto T = FS->findCallTargetMapAt(CallSite))
746 for (const auto &T_C : *T)
747 Sum += T_C.second;
748 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
749 if (M->empty())
750 return R;
751 for (const auto &NameFS : *M) {
752 Sum += NameFS.second.getHeadSamplesEstimate();
753 R.push_back(&NameFS.second);
754 }
755 llvm::sort(R, FSCompare);
756 }
757 return R;
758}
759
760const FunctionSamples *
761SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
763 std::optional<PseudoProbe> Probe = extractProbe(Inst);
764 if (!Probe)
765 return nullptr;
766 }
767
768 const DILocation *DIL = Inst.getDebugLoc();
769 if (!DIL)
770 return Samples;
771
772 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
773 if (it.second) {
775 it.first->second = ContextTracker->getContextSamplesFor(DIL);
776 else
777 it.first->second =
778 Samples->findFunctionSamples(DIL, Reader->getRemapper());
779 }
780 return it.first->second;
781}
782
783/// Check whether the indirect call promotion history of \p Inst allows
784/// the promotion for \p Candidate.
785/// If the profile count for the promotion candidate \p Candidate is
786/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
787/// for \p Inst. If we already have at least MaxNumPromotions
788/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
789/// cannot promote for \p Inst anymore.
790static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
791 uint32_t NumVals = 0;
792 uint64_t TotalCount = 0;
793 std::unique_ptr<InstrProfValueData[]> ValueData =
794 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
795 bool Valid =
796 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
797 ValueData.get(), NumVals, TotalCount, true);
798 // No valid value profile so no promoted targets have been recorded
799 // before. Ok to do ICP.
800 if (!Valid)
801 return true;
802
803 unsigned NumPromoted = 0;
804 for (uint32_t I = 0; I < NumVals; I++) {
805 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
806 continue;
807
808 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
809 // metadata, it means the candidate has been promoted for this
810 // indirect call.
811 if (ValueData[I].Value == Function::getGUID(Candidate))
812 return false;
813 NumPromoted++;
814 // If already have MaxNumPromotions promotion, don't do it anymore.
815 if (NumPromoted == MaxNumPromotions)
816 return false;
817 }
818 return true;
819}
820
821/// Update indirect call target profile metadata for \p Inst.
822/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
823/// If it is 0, it means updateIDTMetaData is used to mark a
824/// certain target to be promoted already. If it is not zero,
825/// we expect to use it to update the total count in the value profile.
826static void
828 const SmallVectorImpl<InstrProfValueData> &CallTargets,
829 uint64_t Sum) {
830 // Bail out early if MaxNumPromotions is zero.
831 // This prevents allocating an array of zero length below.
832 //
833 // Note `updateIDTMetaData` is called in two places so check
834 // `MaxNumPromotions` inside it.
835 if (MaxNumPromotions == 0)
836 return;
837 uint32_t NumVals = 0;
838 // OldSum is the existing total count in the value profile data.
839 uint64_t OldSum = 0;
840 std::unique_ptr<InstrProfValueData[]> ValueData =
841 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
842 bool Valid =
843 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
844 ValueData.get(), NumVals, OldSum, true);
845
846 DenseMap<uint64_t, uint64_t> ValueCountMap;
847 if (Sum == 0) {
848 assert((CallTargets.size() == 1 &&
849 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
850 "If sum is 0, assume only one element in CallTargets "
851 "with count being NOMORE_ICP_MAGICNUM");
852 // Initialize ValueCountMap with existing value profile data.
853 if (Valid) {
854 for (uint32_t I = 0; I < NumVals; I++)
855 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
856 }
857 auto Pair =
858 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
859 // If the target already exists in value profile, decrease the total
860 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
861 if (!Pair.second) {
862 OldSum -= Pair.first->second;
863 Pair.first->second = NOMORE_ICP_MAGICNUM;
864 }
865 Sum = OldSum;
866 } else {
867 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
868 // counts in the value profile.
869 if (Valid) {
870 for (uint32_t I = 0; I < NumVals; I++) {
871 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
872 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
873 }
874 }
875
876 for (const auto &Data : CallTargets) {
877 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
878 if (Pair.second)
879 continue;
880 // The target represented by Data.Value has already been promoted.
881 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
882 // Sum by Data.Count.
883 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
884 Sum -= Data.Count;
885 }
886 }
887
889 for (const auto &ValueCount : ValueCountMap) {
890 NewCallTargets.emplace_back(
891 InstrProfValueData{ValueCount.first, ValueCount.second});
892 }
893
894 llvm::sort(NewCallTargets,
895 [](const InstrProfValueData &L, const InstrProfValueData &R) {
896 if (L.Count != R.Count)
897 return L.Count > R.Count;
898 return L.Value > R.Value;
899 });
900
901 uint32_t MaxMDCount =
902 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
904 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
905}
906
907/// Attempt to promote indirect call and also inline the promoted call.
908///
909/// \param F Caller function.
910/// \param Candidate ICP and inline candidate.
911/// \param SumOrigin Original sum of target counts for indirect call before
912/// promoting given candidate.
913/// \param Sum Prorated sum of remaining target counts for indirect call
914/// after promoting given candidate.
915/// \param InlinedCallSite Output vector for new call sites exposed after
916/// inlining.
917bool SampleProfileLoader::tryPromoteAndInlineCandidate(
918 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
919 SmallVector<CallBase *, 8> *InlinedCallSite) {
920 // Bail out early if sample-loader inliner is disabled.
922 return false;
923
924 // Bail out early if MaxNumPromotions is zero.
925 // This prevents allocating an array of zero length in callees below.
926 if (MaxNumPromotions == 0)
927 return false;
928 auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();
929 auto R = SymbolMap.find(CalleeFunctionName);
930 if (R == SymbolMap.end() || !R->second)
931 return false;
932
933 auto &CI = *Candidate.CallInstr;
934 if (!doesHistoryAllowICP(CI, R->second->getName()))
935 return false;
936
937 const char *Reason = "Callee function not available";
938 // R->getValue() != &F is to prevent promoting a recursive call.
939 // If it is a recursive call, we do not inline it as it could bloat
940 // the code exponentially. There is way to better handle this, e.g.
941 // clone the caller first, and inline the cloned caller if it is
942 // recursive. As llvm does not inline recursive calls, we will
943 // simply ignore it instead of handling it explicitly.
944 if (!R->second->isDeclaration() && R->second->getSubprogram() &&
945 R->second->hasFnAttribute("use-sample-profile") &&
946 R->second != &F && isLegalToPromote(CI, R->second, &Reason)) {
947 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
948 // in the value profile metadata so the target won't be promoted again.
949 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
950 Function::getGUID(R->second->getName()), NOMORE_ICP_MAGICNUM}};
951 updateIDTMetaData(CI, SortedCallTargets, 0);
952
953 auto *DI = &pgo::promoteIndirectCall(
954 CI, R->second, Candidate.CallsiteCount, Sum, false, ORE);
955 if (DI) {
956 Sum -= Candidate.CallsiteCount;
957 // Do not prorate the indirect callsite distribution since the original
958 // distribution will be used to scale down non-promoted profile target
959 // counts later. By doing this we lose track of the real callsite count
960 // for the leftover indirect callsite as a trade off for accurate call
961 // target counts.
962 // TODO: Ideally we would have two separate factors, one for call site
963 // counts and one is used to prorate call target counts.
964 // Do not update the promoted direct callsite distribution at this
965 // point since the original distribution combined with the callee profile
966 // will be used to prorate callsites from the callee if inlined. Once not
967 // inlined, the direct callsite distribution should be prorated so that
968 // the it will reflect the real callsite counts.
969 Candidate.CallInstr = DI;
970 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
971 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
972 if (!Inlined) {
973 // Prorate the direct callsite distribution so that it reflects real
974 // callsite counts.
976 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
977 }
978 return Inlined;
979 }
980 }
981 } else {
982 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
984 Candidate.CallInstr->getName())<< " because "
985 << Reason << "\n");
986 }
987 return false;
988}
989
990bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
992 return false;
993
995 if (Callee == nullptr)
996 return false;
997
999 GetAC, GetTLI);
1000
1001 if (Cost.isNever())
1002 return false;
1003
1004 if (Cost.isAlways())
1005 return true;
1006
1007 return Cost.getCost() <= SampleColdCallSiteThreshold;
1008}
1009
1010void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1011 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1012 bool Hot) {
1013 for (auto *I : Candidates) {
1014 Function *CalledFunction = I->getCalledFunction();
1015 if (CalledFunction) {
1016 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1017 "InlineAttempt", I->getDebugLoc(),
1018 I->getParent())
1019 << "previous inlining reattempted for "
1020 << (Hot ? "hotness: '" : "size: '")
1021 << ore::NV("Callee", CalledFunction) << "' into '"
1022 << ore::NV("Caller", &F) << "'");
1023 }
1024 }
1025}
1026
1027void SampleProfileLoader::findExternalInlineCandidate(
1028 CallBase *CB, const FunctionSamples *Samples,
1029 DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {
1030
1031 // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
1032 // function make sure it's imported
1033 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1034 // Samples may not exist for replayed function, if so
1035 // just add the direct GUID and move on
1036 if (!Samples) {
1037 InlinedGUIDs.insert(
1038 Function::getGUID(CB->getCalledFunction()->getName()));
1039 return;
1040 }
1041 // Otherwise, drop the threshold to import everything that we can
1042 Threshold = 0;
1043 }
1044
1045 // In some rare cases, call instruction could be changed after being pushed
1046 // into inline candidate queue, this is because earlier inlining may expose
1047 // constant propagation which can change indirect call to direct call. When
1048 // this happens, we may fail to find matching function samples for the
1049 // candidate later, even if a match was found when the candidate was enqueued.
1050 if (!Samples)
1051 return;
1052
1053 // For AutoFDO profile, retrieve candidate profiles by walking over
1054 // the nested inlinee profiles.
1056 // Set threshold to zero to honor pre-inliner decision.
1058 Threshold = 0;
1059 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1060 return;
1061 }
1062
1063 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1064 std::queue<ContextTrieNode *> CalleeList;
1065 CalleeList.push(Caller);
1066 while (!CalleeList.empty()) {
1067 ContextTrieNode *Node = CalleeList.front();
1068 CalleeList.pop();
1069 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1070 // For CSSPGO profile, retrieve candidate profile by walking over the
1071 // trie built for context profile. Note that also take call targets
1072 // even if callee doesn't have a corresponding context profile.
1073 if (!CalleeSample)
1074 continue;
1075
1076 // If pre-inliner decision is used, honor that for importing as well.
1077 bool PreInline =
1080 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1081 continue;
1082
1083 Function *Func = SymbolMap.lookup(CalleeSample->getFunction());
1084 // Add to the import list only when it's defined out of module.
1085 if (!Func || Func->isDeclaration())
1086 InlinedGUIDs.insert(CalleeSample->getGUID());
1087
1088 // Import hot CallTargets, which may not be available in IR because full
1089 // profile annotation cannot be done until backend compilation in ThinLTO.
1090 for (const auto &BS : CalleeSample->getBodySamples())
1091 for (const auto &TS : BS.second.getCallTargets())
1092 if (TS.second > Threshold) {
1093 const Function *Callee = SymbolMap.lookup(TS.first);
1094 if (!Callee || Callee->isDeclaration())
1095 InlinedGUIDs.insert(TS.first.getHashCode());
1096 }
1097
1098 // Import hot child context profile associted with callees. Note that this
1099 // may have some overlap with the call target loop above, but doing this
1100 // based child context profile again effectively allow us to use the max of
1101 // entry count and call target count to determine importing.
1102 for (auto &Child : Node->getAllChildContext()) {
1103 ContextTrieNode *CalleeNode = &Child.second;
1104 CalleeList.push(CalleeNode);
1105 }
1106 }
1107}
1108
1109/// Iteratively inline hot callsites of a function.
1110///
1111/// Iteratively traverse all callsites of the function \p F, so as to
1112/// find out callsites with corresponding inline instances.
1113///
1114/// For such callsites,
1115/// - If it is hot enough, inline the callsites and adds callsites of the callee
1116/// into the caller. If the call is an indirect call, first promote
1117/// it to direct call. Each indirect call is limited with a single target.
1118///
1119/// - If a callsite is not inlined, merge the its profile to the outline
1120/// version (if --sample-profile-merge-inlinee is true), or scale the
1121/// counters of standalone function based on the profile of inlined
1122/// instances (if --sample-profile-merge-inlinee is false).
1123///
1124/// Later passes may consume the updated profiles.
1125///
1126/// \param F function to perform iterative inlining.
1127/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1128/// inlined in the profiled binary.
1129///
1130/// \returns True if there is any inline happened.
1131bool SampleProfileLoader::inlineHotFunctions(
1132 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1133 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1134 // Profile symbol list is ignored when profile-sample-accurate is on.
1135 assert((!ProfAccForSymsInList ||
1137 !F.hasFnAttribute("profile-sample-accurate"))) &&
1138 "ProfAccForSymsInList should be false when profile-sample-accurate "
1139 "is enabled");
1140
1141 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1142 bool Changed = false;
1143 bool LocalChanged = true;
1144 while (LocalChanged) {
1145 LocalChanged = false;
1147 for (auto &BB : F) {
1148 bool Hot = false;
1149 SmallVector<CallBase *, 10> AllCandidates;
1150 SmallVector<CallBase *, 10> ColdCandidates;
1151 for (auto &I : BB) {
1152 const FunctionSamples *FS = nullptr;
1153 if (auto *CB = dyn_cast<CallBase>(&I)) {
1154 if (!isa<IntrinsicInst>(I)) {
1155 if ((FS = findCalleeFunctionSamples(*CB))) {
1156 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1157 "GUIDToFuncNameMap has to be populated");
1158 AllCandidates.push_back(CB);
1159 if (FS->getHeadSamplesEstimate() > 0 ||
1161 LocalNotInlinedCallSites.insert({CB, FS});
1162 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1163 Hot = true;
1164 else if (shouldInlineColdCallee(*CB))
1165 ColdCandidates.push_back(CB);
1166 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1167 AllCandidates.push_back(CB);
1168 }
1169 }
1170 }
1171 }
1172 if (Hot || ExternalInlineAdvisor) {
1173 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1174 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1175 } else {
1176 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1177 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1178 }
1179 }
1180 for (CallBase *I : CIS) {
1181 Function *CalledFunction = I->getCalledFunction();
1182 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1183 0 /* dummy count */,
1184 1.0 /* dummy distribution factor */};
1185 // Do not inline recursive calls.
1186 if (CalledFunction == &F)
1187 continue;
1188 if (I->isIndirectCall()) {
1189 uint64_t Sum;
1190 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1191 uint64_t SumOrigin = Sum;
1192 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1193 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1194 PSI->getOrCompHotCountThreshold());
1195 continue;
1196 }
1197 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1198 continue;
1199
1200 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1201 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1202 LocalNotInlinedCallSites.erase(I);
1203 LocalChanged = true;
1204 }
1205 }
1206 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1207 !CalledFunction->isDeclaration()) {
1208 if (tryInlineCandidate(Candidate)) {
1209 LocalNotInlinedCallSites.erase(I);
1210 LocalChanged = true;
1211 }
1212 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1213 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1214 InlinedGUIDs,
1215 PSI->getOrCompHotCountThreshold());
1216 }
1217 }
1218 Changed |= LocalChanged;
1219 }
1220
1221 // For CS profile, profile for not inlined context will be merged when
1222 // base profile is being retrieved.
1224 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1225 return Changed;
1226}
1227
1228bool SampleProfileLoader::tryInlineCandidate(
1229 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1230 // Do not attempt to inline a candidate if
1231 // --disable-sample-loader-inlining is true.
1233 return false;
1234
1235 CallBase &CB = *Candidate.CallInstr;
1236 Function *CalledFunction = CB.getCalledFunction();
1237 assert(CalledFunction && "Expect a callee with definition");
1238 DebugLoc DLoc = CB.getDebugLoc();
1239 BasicBlock *BB = CB.getParent();
1240
1241 InlineCost Cost = shouldInlineCandidate(Candidate);
1242 if (Cost.isNever()) {
1243 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1244 "InlineFail", DLoc, BB)
1245 << "incompatible inlining");
1246 return false;
1247 }
1248
1249 if (!Cost)
1250 return false;
1251
1252 InlineFunctionInfo IFI(GetAC);
1253 IFI.UpdateProfile = false;
1254 InlineResult IR = InlineFunction(CB, IFI,
1255 /*MergeAttributes=*/true);
1256 if (!IR.isSuccess())
1257 return false;
1258
1259 // The call to InlineFunction erases I, so we can't pass it here.
1260 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1261 Cost, true, getAnnotatedRemarkPassName());
1262
1263 // Now populate the list of newly exposed call sites.
1264 if (InlinedCallSites) {
1265 InlinedCallSites->clear();
1266 for (auto &I : IFI.InlinedCallSites)
1267 InlinedCallSites->push_back(I);
1268 }
1269
1271 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1272 ++NumCSInlined;
1273
1274 // Prorate inlined probes for a duplicated inlining callsite which probably
1275 // has a distribution less than 100%. Samples for an inlinee should be
1276 // distributed among the copies of the original callsite based on each
1277 // callsite's distribution factor for counts accuracy. Note that an inlined
1278 // probe may come with its own distribution factor if it has been duplicated
1279 // in the inlinee body. The two factor are multiplied to reflect the
1280 // aggregation of duplication.
1281 if (Candidate.CallsiteDistribution < 1) {
1282 for (auto &I : IFI.InlinedCallSites) {
1283 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1284 setProbeDistributionFactor(*I, Probe->Factor *
1285 Candidate.CallsiteDistribution);
1286 }
1287 NumDuplicatedInlinesite++;
1288 }
1289
1290 return true;
1291}
1292
1293bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1294 CallBase *CB) {
1295 assert(CB && "Expect non-null call instruction");
1296
1297 if (isa<IntrinsicInst>(CB))
1298 return false;
1299
1300 // Find the callee's profile. For indirect call, find hottest target profile.
1301 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1302 // If ExternalInlineAdvisor wants to inline this site, do so even
1303 // if Samples are not present.
1304 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1305 return false;
1306
1307 float Factor = 1.0;
1308 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1309 Factor = Probe->Factor;
1310
1311 uint64_t CallsiteCount =
1312 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1313 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1314 return true;
1315}
1316
1317std::optional<InlineCost>
1318SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1319 std::unique_ptr<InlineAdvice> Advice = nullptr;
1320 if (ExternalInlineAdvisor) {
1321 Advice = ExternalInlineAdvisor->getAdvice(CB);
1322 if (Advice) {
1323 if (!Advice->isInliningRecommended()) {
1324 Advice->recordUnattemptedInlining();
1325 return InlineCost::getNever("not previously inlined");
1326 }
1327 Advice->recordInlining();
1328 return InlineCost::getAlways("previously inlined");
1329 }
1330 }
1331
1332 return {};
1333}
1334
1335bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1336 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1337 return Cost ? !!*Cost : false;
1338}
1339
1341SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1342 if (std::optional<InlineCost> ReplayCost =
1343 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1344 return *ReplayCost;
1345 // Adjust threshold based on call site hotness, only do this for callsite
1346 // prioritized inliner because otherwise cost-benefit check is done earlier.
1347 int SampleThreshold = SampleColdCallSiteThreshold;
1349 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1350 SampleThreshold = SampleHotCallSiteThreshold;
1351 else if (!ProfileSizeInline)
1352 return InlineCost::getNever("cold callsite");
1353 }
1354
1355 Function *Callee = Candidate.CallInstr->getCalledFunction();
1356 assert(Callee && "Expect a definition for inline candidate of direct call");
1357
1358 InlineParams Params = getInlineParams();
1359 // We will ignore the threshold from inline cost, so always get full cost.
1360 Params.ComputeFullInlineCost = true;
1362 // Checks if there is anything in the reachable portion of the callee at
1363 // this callsite that makes this inlining potentially illegal. Need to
1364 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1365 // when cost exceeds threshold without checking all IRs in the callee.
1366 // The acutal cost does not matter because we only checks isNever() to
1367 // see if it is legal to inline the callsite.
1368 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1369 GetTTI(*Callee), GetAC, GetTLI);
1370
1371 // Honor always inline and never inline from call analyzer
1372 if (Cost.isNever() || Cost.isAlways())
1373 return Cost;
1374
1375 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1376 // decisions based on hotness as well as accurate function byte sizes for
1377 // given context using function/inlinee sizes from previous build. It
1378 // stores the decision in profile, and also adjust/merge context profile
1379 // aiming at better context-sensitive post-inline profile quality, assuming
1380 // all inline decision estimates are going to be honored by compiler. Here
1381 // we replay that inline decision under `sample-profile-use-preinliner`.
1382 // Note that we don't need to handle negative decision from preinliner as
1383 // context profile for not inlined calls are merged by preinliner already.
1384 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1385 // Once two node are merged due to promotion, we're losing some context
1386 // so the original context-sensitive preinliner decision should be ignored
1387 // for SyntheticContext.
1388 SampleContext &Context = Candidate.CalleeSamples->getContext();
1389 if (!Context.hasState(SyntheticContext) &&
1390 Context.hasAttribute(ContextShouldBeInlined))
1391 return InlineCost::getAlways("preinliner");
1392 }
1393
1394 // For old FDO inliner, we inline the call site as long as cost is not
1395 // "Never". The cost-benefit check is done earlier.
1397 return InlineCost::get(Cost.getCost(), INT_MAX);
1398 }
1399
1400 // Otherwise only use the cost from call analyzer, but overwite threshold with
1401 // Sample PGO threshold.
1402 return InlineCost::get(Cost.getCost(), SampleThreshold);
1403}
1404
1405bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1406 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1407 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1408 // Profile symbol list is ignored when profile-sample-accurate is on.
1409 assert((!ProfAccForSymsInList ||
1411 !F.hasFnAttribute("profile-sample-accurate"))) &&
1412 "ProfAccForSymsInList should be false when profile-sample-accurate "
1413 "is enabled");
1414
1415 // Populating worklist with initial call sites from root inliner, along
1416 // with call site weights.
1417 CandidateQueue CQueue;
1418 InlineCandidate NewCandidate;
1419 for (auto &BB : F) {
1420 for (auto &I : BB) {
1421 auto *CB = dyn_cast<CallBase>(&I);
1422 if (!CB)
1423 continue;
1424 if (getInlineCandidate(&NewCandidate, CB))
1425 CQueue.push(NewCandidate);
1426 }
1427 }
1428
1429 // Cap the size growth from profile guided inlining. This is needed even
1430 // though cost of each inline candidate already accounts for callee size,
1431 // because with top-down inlining, we can grow inliner size significantly
1432 // with large number of smaller inlinees each pass the cost check.
1434 "Max inline size limit should not be smaller than min inline size "
1435 "limit.");
1436 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1437 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1438 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1439 if (ExternalInlineAdvisor)
1440 SizeLimit = std::numeric_limits<unsigned>::max();
1441
1442 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1443
1444 // Perform iterative BFS call site prioritized inlining
1445 bool Changed = false;
1446 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1447 InlineCandidate Candidate = CQueue.top();
1448 CQueue.pop();
1449 CallBase *I = Candidate.CallInstr;
1450 Function *CalledFunction = I->getCalledFunction();
1451
1452 if (CalledFunction == &F)
1453 continue;
1454 if (I->isIndirectCall()) {
1455 uint64_t Sum = 0;
1456 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
1457 uint64_t SumOrigin = Sum;
1458 Sum *= Candidate.CallsiteDistribution;
1459 unsigned ICPCount = 0;
1460 for (const auto *FS : CalleeSamples) {
1461 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1462 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1463 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1464 PSI->getOrCompHotCountThreshold());
1465 continue;
1466 }
1467 uint64_t EntryCountDistributed =
1468 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1469 // In addition to regular inline cost check, we also need to make sure
1470 // ICP isn't introducing excessive speculative checks even if individual
1471 // target looks beneficial to promote and inline. That means we should
1472 // only do ICP when there's a small number dominant targets.
1473 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1474 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1475 break;
1476 // TODO: Fix CallAnalyzer to handle all indirect calls.
1477 // For indirect call, we don't run CallAnalyzer to get InlineCost
1478 // before actual inlining. This is because we could see two different
1479 // types from the same definition, which makes CallAnalyzer choke as
1480 // it's expecting matching parameter type on both caller and callee
1481 // side. See example from PR18962 for the triggering cases (the bug was
1482 // fixed, but we generate different types).
1483 if (!PSI->isHotCount(EntryCountDistributed))
1484 break;
1485 SmallVector<CallBase *, 8> InlinedCallSites;
1486 // Attach function profile for promoted indirect callee, and update
1487 // call site count for the promoted inline candidate too.
1488 Candidate = {I, FS, EntryCountDistributed,
1489 Candidate.CallsiteDistribution};
1490 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1491 &InlinedCallSites)) {
1492 for (auto *CB : InlinedCallSites) {
1493 if (getInlineCandidate(&NewCandidate, CB))
1494 CQueue.emplace(NewCandidate);
1495 }
1496 ICPCount++;
1497 Changed = true;
1498 } else if (!ContextTracker) {
1499 LocalNotInlinedCallSites.insert({I, FS});
1500 }
1501 }
1502 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1503 !CalledFunction->isDeclaration()) {
1504 SmallVector<CallBase *, 8> InlinedCallSites;
1505 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1506 for (auto *CB : InlinedCallSites) {
1507 if (getInlineCandidate(&NewCandidate, CB))
1508 CQueue.emplace(NewCandidate);
1509 }
1510 Changed = true;
1511 } else if (!ContextTracker) {
1512 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1513 }
1514 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1515 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1516 InlinedGUIDs,
1517 PSI->getOrCompHotCountThreshold());
1518 }
1519 }
1520
1521 if (!CQueue.empty()) {
1522 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1523 ++NumCSInlinedHitMaxLimit;
1524 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1525 ++NumCSInlinedHitMinLimit;
1526 else
1527 ++NumCSInlinedHitGrowthLimit;
1528 }
1529
1530 // For CS profile, profile for not inlined context will be merged when
1531 // base profile is being retrieved.
1533 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1534 return Changed;
1535}
1536
1537void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1539 const Function &F) {
1540 // Accumulate not inlined callsite information into notInlinedSamples
1541 for (const auto &Pair : NonInlinedCallSites) {
1542 CallBase *I = Pair.first;
1543 Function *Callee = I->getCalledFunction();
1544 if (!Callee || Callee->isDeclaration())
1545 continue;
1546
1547 ORE->emit(
1548 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1549 I->getDebugLoc(), I->getParent())
1550 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1551 << "' into '" << ore::NV("Caller", &F) << "'");
1552
1553 ++NumCSNotInlined;
1554 const FunctionSamples *FS = Pair.second;
1555 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1556 continue;
1557 }
1558
1559 // Do not merge a context that is already duplicated into the base profile.
1560 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1561 continue;
1562
1563 if (ProfileMergeInlinee) {
1564 // A function call can be replicated by optimizations like callsite
1565 // splitting or jump threading and the replicates end up sharing the
1566 // sample nested callee profile instead of slicing the original
1567 // inlinee's profile. We want to do merge exactly once by filtering out
1568 // callee profiles with a non-zero head sample count.
1569 if (FS->getHeadSamples() == 0) {
1570 // Use entry samples as head samples during the merge, as inlinees
1571 // don't have head samples.
1572 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1573 FS->getHeadSamplesEstimate());
1574
1575 // Note that we have to do the merge right after processing function.
1576 // This allows OutlineFS's profile to be used for annotation during
1577 // top-down processing of functions' annotation.
1578 FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);
1579 // If outlined function does not exist in the profile, add it to a
1580 // separate map so that it does not rehash the original profile.
1581 if (!OutlineFS)
1582 OutlineFS = &OutlineFunctionSamples[
1584 OutlineFS->merge(*FS, 1);
1585 // Set outlined profile to be synthetic to not bias the inliner.
1586 OutlineFS->SetContextSynthetic();
1587 }
1588 } else {
1589 auto pair =
1590 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1591 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1592 }
1593 }
1594}
1595
1596/// Returns the sorted CallTargetMap \p M by count in descending order.
1600 for (const auto &I : SampleRecord::SortCallTargets(M)) {
1601 R.emplace_back(
1602 InstrProfValueData{I.first.getHashCode(), I.second});
1603 }
1604 return R;
1605}
1606
1607// Generate MD_prof metadata for every branch instruction using the
1608// edge weights computed during propagation.
1609void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1610 // Generate MD_prof metadata for every branch instruction using the
1611 // edge weights computed during propagation.
1612 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1613 LLVMContext &Ctx = F.getContext();
1614 MDBuilder MDB(Ctx);
1615 for (auto &BI : F) {
1616 BasicBlock *BB = &BI;
1617
1618 if (BlockWeights[BB]) {
1619 for (auto &I : *BB) {
1620 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1621 continue;
1622 if (!cast<CallBase>(I).getCalledFunction()) {
1623 const DebugLoc &DLoc = I.getDebugLoc();
1624 if (!DLoc)
1625 continue;
1626 const DILocation *DIL = DLoc;
1627 const FunctionSamples *FS = findFunctionSamples(I);
1628 if (!FS)
1629 continue;
1632 FS->findCallTargetMapAt(CallSite);
1633 if (!T || T.get().empty())
1634 continue;
1636 // Prorate the callsite counts based on the pre-ICP distribution
1637 // factor to reflect what is already done to the callsite before
1638 // ICP, such as callsite cloning.
1639 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1640 if (Probe->Factor < 1)
1641 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1642 }
1643 }
1644 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1646 uint64_t Sum = 0;
1647 for (const auto &C : T.get())
1648 Sum += C.second;
1649 // With CSSPGO all indirect call targets are counted towards the
1650 // original indirect call site in the profile, including both
1651 // inlined and non-inlined targets.
1653 if (const FunctionSamplesMap *M =
1654 FS->findFunctionSamplesMapAt(CallSite)) {
1655 for (const auto &NameFS : *M)
1656 Sum += NameFS.second.getHeadSamplesEstimate();
1657 }
1658 }
1659 if (Sum)
1660 updateIDTMetaData(I, SortedCallTargets, Sum);
1661 else if (OverwriteExistingWeights)
1662 I.setMetadata(LLVMContext::MD_prof, nullptr);
1663 } else if (!isa<IntrinsicInst>(&I)) {
1664 setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])});
1665 }
1666 }
1668 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1669 // clear it for cold code.
1670 for (auto &I : *BB) {
1671 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1672 if (cast<CallBase>(I).isIndirectCall()) {
1673 I.setMetadata(LLVMContext::MD_prof, nullptr);
1674 } else {
1676 }
1677 }
1678 }
1679 }
1680
1681 Instruction *TI = BB->getTerminator();
1682 if (TI->getNumSuccessors() == 1)
1683 continue;
1684 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1685 !isa<IndirectBrInst>(TI))
1686 continue;
1687
1688 DebugLoc BranchLoc = TI->getDebugLoc();
1689 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1690 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1691 : Twine("<UNKNOWN LOCATION>"))
1692 << ".\n");
1694 uint32_t MaxWeight = 0;
1695 Instruction *MaxDestInst;
1696 // Since profi treats multiple edges (multiway branches) as a single edge,
1697 // we need to distribute the computed weight among the branches. We do
1698 // this by evenly splitting the edge weight among destinations.
1700 std::vector<uint64_t> EdgeIndex;
1702 EdgeIndex.resize(TI->getNumSuccessors());
1703 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1704 const BasicBlock *Succ = TI->getSuccessor(I);
1705 EdgeIndex[I] = EdgeMultiplicity[Succ];
1706 EdgeMultiplicity[Succ]++;
1707 }
1708 }
1709 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1710 BasicBlock *Succ = TI->getSuccessor(I);
1711 Edge E = std::make_pair(BB, Succ);
1712 uint64_t Weight = EdgeWeights[E];
1713 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1714 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1715 // if needed. Sample counts in profiles are 64-bit unsigned values,
1716 // but internally branch weights are expressed as 32-bit values.
1717 if (Weight > std::numeric_limits<uint32_t>::max()) {
1718 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1719 Weight = std::numeric_limits<uint32_t>::max();
1720 }
1721 if (!SampleProfileUseProfi) {
1722 // Weight is added by one to avoid propagation errors introduced by
1723 // 0 weights.
1724 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1725 } else {
1726 // Profi creates proper weights that do not require "+1" adjustments but
1727 // we evenly split the weight among branches with the same destination.
1728 uint64_t W = Weight / EdgeMultiplicity[Succ];
1729 // Rounding up, if needed, so that first branches are hotter.
1730 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1731 W++;
1732 Weights.push_back(static_cast<uint32_t>(W));
1733 }
1734 if (Weight != 0) {
1735 if (Weight > MaxWeight) {
1736 MaxWeight = Weight;
1737 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1738 }
1739 }
1740 }
1741
1742 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1743
1744 uint64_t TempWeight;
1745 // Only set weights if there is at least one non-zero weight.
1746 // In any other case, let the analyzer set weights.
1747 // Do not set weights if the weights are present unless under
1748 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1749 // twice. If the first annotation already set the weights, the second pass
1750 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1751 // weight should have their existing metadata (possibly annotated by LTO
1752 // prelink) cleared.
1753 if (MaxWeight > 0 &&
1754 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1755 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1756 setBranchWeights(*TI, Weights);
1757 ORE->emit([&]() {
1758 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1759 << "most popular destination for conditional branches at "
1760 << ore::NV("CondBranchesLoc", BranchLoc);
1761 });
1762 } else {
1764 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1765 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1766 } else {
1767 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1768 }
1769 }
1770 }
1771}
1772
1773/// Annotate \p F with the sample profile: run the sample-driven inliner,
1774/// compute and propagate weights, then emit the MD_prof metadata.
1775///
/// Returns false early when ProbeManager->profileIsValid() rejects the
/// profile for \p F, or when getFunctionLoc(F) returns 0.
///
1776/// \param F The function to annotate.
1777///
1778/// \returns true if \p F was modified. Returns false, otherwise.
1779bool SampleProfileLoader::emitAnnotations(Function &F) {
1780 bool Changed = false;
1781
     // NOTE(review): listing line 1782 is absent from this extract. The
     // "} else {" at 1799 implies it opens the if-block guarding the
     // probe-based path below -- confirm against upstream.
1783 LLVM_DEBUG({
1784 if (!ProbeManager->getDesc(F))
1785 dbgs() << "Probe descriptor missing for Function " << F.getName()
1786 << "\n";
1787 });
1788
     // Keep the matched/mismatched statistics up to date; a profile that
     // fails validation is not applied to F.
1789 if (ProbeManager->profileIsValid(F, *Samples)) {
1790 ++NumMatchedProfile;
1791 } else {
1792 ++NumMismatchedProfile;
1793 LLVM_DEBUG(
1794 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1795 << F.getName() << "\n");
     // NOTE(review): listing line 1796 is absent from this extract.
1797 return false;
1798 }
1799 } else {
     // getFunctionLoc() is reported below as the line number of F's first
     // instruction; a value of 0 is treated as unusable.
1800 if (getFunctionLoc(F) == 0)
1801 return false;
1802
1803 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1804 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1805 }
1806
     // GUIDs recorded by the inliner are handed on to weight propagation.
1807 DenseSet<GlobalValue::GUID> InlinedGUIDs;
     // NOTE(review): listing line 1808 is absent; the dangling "else" at 1810
     // implies an if-condition selecting the priority-based inliner here.
1809 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1810 else
1811 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1812
1813 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1814
     // MD_prof metadata is generated only if inlining or weight propagation
     // reported a change.
1815 if (Changed)
1816 generateMDProfMetadata(F);
1817
1818 emitCoverageRemarks(F);
1819 return Changed;
1820}
1821
/// Build the call graph derived from profile data for \p M.
///
/// The graph is seeded either from *ContextTracker or from
/// Reader->getProfiles(); functions of \p M are then added to it so that
/// functions missing from the profile are still processed.
///
/// \returns the newly built graph; ownership passes to the caller.
1822std::unique_ptr<ProfiledCallGraph>
1823SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1824 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
     // NOTE(review): listing line 1825 is absent; the "else" at 1827 implies
     // an if-condition selecting the ContextTracker-based graph here.
1826 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1827 else
1828 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1829
1830 // Add all functions into the profiled call graph even if they are not in
1831 // the profile. This makes sure functions missing from the profile still
1832 // get a chance to be processed.
1833 for (Function &F : M) {
     // NOTE(review): listing line 1834 (the condition guarding this
     // "continue") and line 1837 (the argument of addProfiledFunction) are
     // absent from this extract.
1835 continue;
1836 ProfiledCG->addProfiledFunction(
1838 }
1839
1840 return ProfiledCG;
1841}
1842
/// Compute the order in which the functions of \p M are processed.
///
/// When ProfileTopDownLoad is off, functions are returned in module iteration
/// order (and ProfileMergeInlinee is switched off as a side effect).
/// Otherwise the list is collected SCC by SCC, either from the profiled call
/// graph or from the postorder RefSCCs of \p CG, and then reversed.
///
/// \param M  The module whose functions are ordered.
/// \param CG Static lazy call graph, used when the profiled call graph is not.
/// \returns the functions to process, in processing order.
1843std::vector<Function *>
1844SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1845 std::vector<Function *> FunctionOrderList;
1846 FunctionOrderList.reserve(M.size());
1847
     // NOTE(review): listing line 1848 (the condition under which this
     // warning is printed) is absent from this extract.
1849 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1850 "together with -sample-profile-top-down-load.\n";
1851
1852 if (!ProfileTopDownLoad) {
1853 if (ProfileMergeInlinee) {
1854 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1855 // because the profile for a function may be used for the profile
1856 // annotation of its outline copy before the profile merging of its
1857 // non-inlined inline instances, and that is not how
1858 // ProfileMergeInlinee is supposed to work.
1859 ProfileMergeInlinee = false;
1860 }
1861
1862 for (Function &F : M)
     // NOTE(review): listing line 1863 is absent; presumably a filter on F
     // guarding the push_back below -- confirm against upstream.
1864 FunctionOrderList.push_back(&F);
1865 return FunctionOrderList;
1866 }
1867
     // NOTE(review): listing line 1868 (the start of this if-condition) is
     // absent; only its tail, which tests UseProfiledCallGraph, is visible.
1869 !UseProfiledCallGraph.getNumOccurrences())) {
1870 // Use profiled call edges to augment the top-down order. There are cases
1871 // where the top-down order computed based on the static call graph doesn't
1872 // reflect real execution order. For example
1873 //
1874 // 1. Incomplete static call graph due to unknown indirect call targets.
1875 // Adjusting the order by considering indirect call edges from the
1876 // profile can enable the inlining of indirect call targets by allowing
1877 // the caller to be processed before them.
1878 // 2. Mutual call edges in an SCC. The static processing order computed for
1879 // an SCC may not reflect the call contexts in the context-sensitive
1880 // profile, thus may cause potential inlining to be overlooked. The
1881 // function order in one SCC is being adjusted to a top-down order based
1882 // on the profile to favor more inlining. This is only a problem with CS
1883 // profile.
1884 // 3. Transitive indirect call edges due to inlining. When a callee function
1885 // (say B) is inlined into a caller function (say A) in LTO prelink,
1886 // every call edge originated from the callee B will be transferred to
1887 // the caller A. If any transferred edge (say A->C) is indirect, the
1888 // original profiled indirect edge B->C, even if considered, would not
1889 // enforce a top-down order from the caller A to the potential indirect
1890 // call target C in LTO postlink since the inlined callee B is gone from
1891 // the static call graph.
1892 // 4. #3 can happen even for direct call targets, due to functions defined
1893 // in header files. A header function (say A), when included into source
1894 // files, is defined multiple times but only one definition survives due
1895 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1896 // definitions can be useless based on a local file scope. More
1897 // importantly, the inlinee (say B), once fully inlined to a
1898 // to-be-dropped A, will have no profile to consume when its outlined
1899 // version is compiled. This can lead to a profile-less prelink
1900 // compilation for the outlined version of B which may be called from
1901 // external modules. While this isn't easy to fix, we rely on the
1902 // postlink AutoFDO pipeline to optimize B. Since the surviving copy of
1903 // A can be inlined in its local scope in prelink, it may not exist
1904 // in the merged IR in postlink, and we'll need the profiled call edges
1905 // to enforce a top-down order for the rest of the functions.
1906 //
1907 // Considering those cases, a profiled call graph completely independent of
1908 // the static call graph is constructed based on profile data, where
1909 // function objects are not even needed to handle case #3 and case #4.
1910 //
1911 // Note that static callgraph edges are completely ignored since they
1912 // can be conflicting with profiled edges for cyclic SCCs and may result in
1913 // an SCC order incompatible with profile-defined one. Using strictly
1914 // profile order ensures a maximum inlining experience. On the other hand,
1915 // static call edges are not so important when they don't correspond to a
1916 // context in the profile.
1917
1918 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1919 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1920 while (!CGI.isAtEnd()) {
1921 auto Range = *CGI;
1922 if (SortProfiledSCC) {
1923 // Sort nodes in one SCC based on callsite hotness.
     // NOTE(review): listing line 1924, which declares "SI" used below, is
     // absent from this extract.
1925 Range = *SI;
1926 }
     // Map each profiled node back to a function of this module; nodes with
     // no SymbolMap entry or whose function is skipped are dropped.
1927 for (auto *Node : Range) {
1928 Function *F = SymbolMap.lookup(Node->Name);
1929 if (F && !skipProfileForFunction(*F))
1930 FunctionOrderList.push_back(F);
1931 }
1932 ++CGI;
1933 }
1934 } else {
     // Fall back to the static call graph.
1935 CG.buildRefSCCs();
1936 for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
1937 for (LazyCallGraph::SCC &C : RC) {
1938 for (LazyCallGraph::Node &N : C) {
1939 Function &F = N.getFunction();
     // NOTE(review): listing line 1940 is absent; presumably a filter on F
     // guarding the push_back below -- confirm against upstream.
1941 FunctionOrderList.push_back(&F);
1942 }
1943 }
1944 }
1945 }
1946
     // Both branches above collect the functions in the opposite of the
     // desired processing order, so flip the list before returning it.
1947 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1948
1949 LLVM_DEBUG({
1950 dbgs() << "Function processing order:\n";
1951 for (auto F : FunctionOrderList) {
1952 dbgs() << F->getName() << "\n";
1953 }
1954 });
1955
1956 return FunctionOrderList;
1957}
1958
/// Open and read the sample profile for \p M and set up the loader state that
/// depends on it: the profile symbol list, the names/GUIDs seen in the
/// profile, the optional replay inline advisor, profile-kind-specific option
/// defaults, the context tracker, the pseudo-probe manager and the stale
/// profile matcher.
///
/// \returns false, after emitting a diagnostic, when the reader cannot be
/// created, when reading the profile fails, or when the profile is
/// probe-based but \p M is not probed. Returns true otherwise.
///
/// NOTE(review): listing line 1960, which holds the rest of the parameter
/// list, is absent from this extract; the body reads a nullable "FAM".
1959bool SampleProfileLoader::doInitialization(Module &M,
1961 auto &Ctx = M.getContext();
1962
1963 auto ReaderOrErr = SampleProfileReader::create(
1964 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
1965 if (std::error_code EC = ReaderOrErr.getError()) {
1966 std::string Msg = "Could not open profile: " + EC.message();
1967 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1968 return false;
1969 }
1970 Reader = std::move(ReaderOrErr.get());
1971 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
1972 // set module before reading the profile so reader may be able to only
1973 // read the function profiles which are used by the current module.
1974 Reader->setModule(&M);
1975 if (std::error_code EC = Reader->read()) {
1976 std::string Msg = "profile reading failed: " + EC.message();
1977 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1978 return false;
1979 }
1980
1981 PSL = Reader->getProfileSymbolList();
1982
1983 // While profile-sample-accurate is on, ignore symbol list.
     // NOTE(review): listing line 1985, the right-hand side of this
     // assignment, is absent from this extract.
1984 ProfAccForSymsInList =
1986 if (ProfAccForSymsInList) {
     // Rebuild the sets of names / GUIDs that appear in the profile.
1987 NamesInProfile.clear();
1988 GUIDsInProfile.clear();
1989 if (auto NameTable = Reader->getNameTable()) {
     // NOTE(review): listing line 1990 is absent; the "} else {" at 1993
     // implies an if-condition choosing between hash codes and string names.
1991 for (auto Name : *NameTable)
1992 GUIDsInProfile.insert(Name.getHashCode());
1993 } else {
1994 for (auto Name : *NameTable)
1995 NamesInProfile.insert(Name.stringRef());
1996 }
1997 }
1998 CoverageTracker.setProfAccForSymsInList(true);
1999 }
2000
     // An inline replay file was supplied: build the replay advisor.
2001 if (FAM && !ProfileInlineReplayFile.empty()) {
2002 ExternalInlineAdvisor = getReplayInlineAdvisor(
2003 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
     // NOTE(review): listing lines 2004-2007 (further arguments) are absent.
2008 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2009 }
2010
2011 // Apply tweaks if context-sensitive or probe-based profile is available.
     // Each option is only defaulted when it was not given explicitly
     // (getNumOccurrences() == 0).
2012 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2013 Reader->profileIsProbeBased()) {
2014 if (!UseIterativeBFIInference.getNumOccurrences())
     // NOTE(review): listing line 2015 (the statement controlled by the
     // "if" above) is absent from this extract.
2016 if (!SampleProfileUseProfi.getNumOccurrences())
2017 SampleProfileUseProfi = true;
2018 if (!EnableExtTspBlockPlacement.getNumOccurrences())
     // NOTE(review): listing line 2019 (the statement controlled by the
     // "if" above) is absent from this extract.
2020 // Enable priority-based inliner and size inline by default for CSSPGO.
2021 if (!ProfileSizeInline.getNumOccurrences())
2022 ProfileSizeInline = true;
2023 if (!CallsitePrioritizedInline.getNumOccurrences())
     // NOTE(review): listing line 2024 (the statement controlled by the
     // "if" above) is absent from this extract.
2025 // For CSSPGO, we also allow recursive inline to best use context profile.
2026 if (!AllowRecursiveInline.getNumOccurrences())
2027 AllowRecursiveInline = true;
2028
2029 if (Reader->profileIsPreInlined()) {
2030 if (!UsePreInlinerDecision.getNumOccurrences())
2031 UsePreInlinerDecision = true;
2032 }
2033
2034 // Enable stale profile matching by default for probe-based profile.
2035 // Currently the matching relies on whether a checksum mismatch is detected,
2036 // which is currently only available for pseudo-probe mode. Removing the
2037 // checksum check could cause regressions for some cases, so further tuning
2038 // might be needed if we want to enable it for all cases.
2039 if (Reader->profileIsProbeBased() &&
2040 !SalvageStaleProfile.getNumOccurrences()) {
2041 SalvageStaleProfile = true;
2042 }
2043
2044 if (!Reader->profileIsCS()) {
2045 // Non-CS profile should be fine without a function size budget for the
2046 // inliner since the contexts in the profile are either all from inlining
2047 // in the previous build or pre-computed by the preinliner with a size
2048 // cap, thus they are bounded.
2049 if (!ProfileInlineLimitMin.getNumOccurrences())
2050 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2051 if (!ProfileInlineLimitMax.getNumOccurrences())
2052 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2053 }
2054 }
2055
2056 if (Reader->profileIsCS()) {
2057 // Tracker for profiles under different context
2058 ContextTracker = std::make_unique<SampleContextTracker>(
2059 Reader->getProfiles(), &GUIDToFuncNameMap);
2060 }
2061
2062 // Load pseudo probe descriptors for probe-based function samples.
2063 if (Reader->profileIsProbeBased()) {
2064 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2065 if (!ProbeManager->moduleIsProbed(M)) {
2066 const char *Msg =
2067 "Pseudo-probe-based profile requires SampleProfileProbePass";
2068 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2069 DS_Warning));
2070 return false;
2071 }
2072 }
2073
     // NOTE(review): listing lines 2074-2075 (the condition under which the
     // matcher is created) are absent from this extract.
2076 MatchingManager = std::make_unique<SampleProfileMatcher>(
2077 M, *Reader, ProbeManager.get(), LTOPhase);
2078 }
2079
2080 return true;
2081}
2082
2083// Note that this is a module-level check. Even if one module is errored out,
2084// the entire build will be errored out. However, the user could make big
2085// changes to functions in a single module but those changes might not be
2086// performance significant to the whole binary. Therefore, to avoid those false
2087// positives, we select a reasonably big set of hot functions that are supposed
2088// to be globally performance significant, and only compute and check the
2089// mismatch within those functions. The function selection is based on two
2090// criteria: 1) The function is hot enough, which is tuned by a hotness-based
2091// flag(HotFuncCutoffForStalenessError). 2) The number of functions is large
2092// enough, which is tuned by the MinfuncsForStalenessError flag.
//
// Returns true, after emitting a diagnostic on M's context, when the share of
// hash-mismatched functions among the selected ones reaches
// PrecentMismatchForStalenessError percent; returns false otherwise, including
// when fewer than MinfuncsForStalenessError functions were selected.
2093bool SampleProfileLoader::rejectHighStalenessProfile(
2094 Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) {
     // NOTE(review): listing line 2095 is absent; the string below looks like
     // the message operand of an assert -- confirm against upstream.
2096 "Only support for probe-based profile");
2097 uint64_t TotalHotFunc = 0;
2098 uint64_t NumMismatchedFunc = 0;
2099 for (const auto &I : Profiles) {
2100 const auto &FS = I.second;
     // Profiles without a probe descriptor are not counted at all.
2101 const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());
2102 if (!FuncDesc)
2103 continue;
2104
2105 // Use a hotness-based threshold to control the function selection.
     // NOTE(review): listing line 2106 (the start of this condition) is
     // absent; only its last argument, FS.getTotalSamples(), is visible.
2107 FS.getTotalSamples()))
2108 continue;
2109
2110 TotalHotFunc++;
2111 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS))
2112 NumMismatchedFunc++;
2113 }
2114 // Make sure that the number of selected functions is not too small to
2115 // distinguish from the user's benign changes.
2116 if (TotalHotFunc < MinfuncsForStalenessError)
2117 return false;
2118
2119 // Finally check the mismatch percentage against the threshold.
     // Cross-multiplied form of NumMismatchedFunc / TotalHotFunc >= percent / 100.
2120 if (NumMismatchedFunc * 100 >=
2121 TotalHotFunc * PrecentMismatchForStalenessError) {
2122 auto &Ctx = M.getContext();
2123 const char *Msg =
2124 "The input profile significantly mismatches current source code. "
2125 "Please recollect profile to avoid performance regression.";
2126 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg));
2127 return true;
2128 }
2129 return false;
2130}
2131
/// Erase every PseudoProbeInst from every function of \p M.
///
/// For each function the probe instructions are first collected into a list
/// and only erased once all of its basic blocks have been walked, so no
/// instruction is removed while a block is being iterated.
2132void SampleProfileLoader::removePseudoProbeInsts(Module &M) {
2133 for (auto &F : M) {
2134 std::vector<Instruction *> InstsToDel;
2135 for (auto &BB : F) {
2136 for (auto &I : BB) {
2137 if (isa<PseudoProbeInst>(&I))
2138 InstsToDel.push_back(&I);
2139 }
2140 }
     // Deferred erase of everything collected for this function.
2141 for (auto *I : InstsToDel)
2142 I->eraseFromParent();
2143 }
2144}
2145
/// Apply the loaded sample profile to every function of \p M.
///
/// Installs the profile summary on the module if it has none, optionally
/// rejects a profile that is too stale, accumulates TotalCollectedSamples,
/// fills SymbolMap (original, alternative and remapped names), and then runs
/// runOnFunction() over the list produced by buildFunctionOrder().
///
/// \param M    The module to annotate.
/// \param AM   Module analysis manager, forwarded to runOnFunction().
/// \param _PSI Profile summary info; stored in the PSI member.
/// \param CG   Lazy call graph, forwarded to buildFunctionOrder().
/// \returns false if the profile is rejected for staleness; otherwise the OR
/// of the runOnFunction() results.
2146bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2147 ProfileSummaryInfo *_PSI,
2148 LazyCallGraph &CG) {
2149 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2150
2151 PSI = _PSI;
     // Only install the reader's summary when the module has no non-CS
     // profile summary yet, then refresh PSI so it picks the new one up.
2152 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2153 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
     // NOTE(review): listing line 2154 (the remaining argument of
     // setProfileSummary) is absent from this extract.
2155 PSI->refresh();
2156 }
2157
     // NOTE(review): listing line 2158 (the start of this if-condition) is
     // absent; only the staleness check itself is visible.
2159 rejectHighStalenessProfile(M, PSI, Reader->getProfiles()))
2160 return false;
2161
2162 // Compute the total number of samples collected in this profile.
2163 for (const auto &I : Reader->getProfiles())
2164 TotalCollectedSamples += I.second.getTotalSamples();
2165
2166 auto Remapper = Reader->getRemapper();
2167 // Populate the symbol map.
2168 for (const auto &N_F : M.getValueSymbolTable()) {
2169 StringRef OrigName = N_F.getKey();
2170 Function *F = dyn_cast<Function>(N_F.getValue());
     // Skip symbol-table entries that are not functions or have no name.
2171 if (F == nullptr || OrigName.empty())
2172 continue;
2173 SymbolMap[FunctionId(OrigName)] = F;
     // NOTE(review): listing line 2174, which declares "NewName" used below,
     // is absent; the comment at 2177-2180 suggests it is a stripped form of
     // OrigName -- confirm against upstream.
2175 if (OrigName != NewName && !NewName.empty()) {
2176 auto r = SymbolMap.emplace(FunctionId(NewName), F);
2177 // Failing to insert means there is already an entry in SymbolMap,
2178 // thus there are multiple functions that are mapped to the same
2179 // stripped name. In this case of name conflicting, set the value
2180 // to nullptr to avoid confusion.
2181 if (!r.second)
2182 r.first->second = nullptr;
2183 OrigName = NewName;
2184 }
2185 // Insert the remapped names into SymbolMap.
2186 if (Remapper) {
2187 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2188 if (*MapName != OrigName && !MapName->empty())
2189 SymbolMap.emplace(FunctionId(*MapName), F);
2190 }
2191 }
2192 }
2193 assert(SymbolMap.count(FunctionId()) == 0 &&
2194 "No empty StringRef should be added in SymbolMap");
2195
     // NOTE(review): listing lines 2196-2197 (the condition guarding the
     // matcher run) are absent from this extract.
2198 MatchingManager->runOnModule();
2199 MatchingManager->clearMatchingData();
2200 }
2201
2202 bool retval = false;
     // Per-function state is cleared before each function is processed.
2203 for (auto *F : buildFunctionOrder(M, CG)) {
2204 assert(!F->isDeclaration());
2205 clearFunctionData();
2206 retval |= runOnFunction(*F, AM);
2207 }
2208
2209 // Account for cold calls not inlined....
     // NOTE(review): listing line 2210 (presumably a condition guarding this
     // loop) is absent from this extract.
2211 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2212 notInlinedCallInfo)
2213 updateProfileCallee(pair.first, pair.second.entryCount);
2214
     // NOTE(review): listing line 2215 (the condition under which probes are
     // removed) is absent from this extract.
2216 removePseudoProbeInsts(M);
2217
2218 return retval;
2219}
2220
/// Process a single function: choose and install its initial entry count,
/// set up the remark emitter, locate the function's samples, and annotate
/// the function if non-empty samples were found.
///
/// \param F  The function to process.
/// \param AM Module analysis manager; when null, a locally owned
///           OptimizationRemarkEmitter is created instead.
/// \returns the result of emitAnnotations(F) when samples exist and are
/// non-empty; false otherwise.
2221bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2222 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2223 DILocation2SampleMap.clear();
2224 // By default the entry count is initialized to -1, which will be treated
2225 // conservatively by getEntryCount as the same as unknown (None). This is
2226 // to avoid newly added code to be treated as cold. If we have samples
2227 // this will be overwritten in emitAnnotations.
2228 uint64_t initialEntryCount = -1;
2229
2230 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2231 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2232 // initialize all the function entry counts to 0. It means all the
2233 // functions without profile will be regarded as cold.
2234 initialEntryCount = 0;
2235 // profile-sample-accurate is a user assertion which has a higher precedence
2236 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2237 ProfAccForSymsInList = false;
2238 }
2239 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2240
2241 // PSL -- profile symbol list include all the symbols in sampled binary.
2242 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2243 // old functions without samples being cold, without having to worry
2244 // about new and hot functions being mistakenly treated as cold.
2245 if (ProfAccForSymsInList) {
2246 // Initialize the entry count to 0 for functions in the list.
2247 if (PSL->contains(F.getName()))
2248 initialEntryCount = 0;
2249
2250 // Function in the symbol list but without sample will be regarded as
2251 // cold. To minimize the potential negative performance impact it could
2252 // have, we want to be a little conservative here saying if a function
2253 // shows up in the profile, no matter as outline function, inline instance
2254 // or call targets, treat the function as not being cold. This will handle
2255 // the cases such as most callsites of a function are inlined in sampled
2256 // binary but not inlined in current build (because of source code drift,
2257 // imprecise debug information, or the callsites are all cold individually
2258 // but not cold accumulatively...), so the outline function showing up as
2259 // cold in sampled binary will actually not be cold after current build.
     // NOTE(review): listing lines 2260-2261 are absent; they declare
     // "CanonName" and open the if-condition whose tail follows.
2262 GUIDsInProfile.count(Function::getGUID(CanonName))) ||
2263 (!FunctionSamples::UseMD5 && NamesInProfile.count(CanonName)))
2264 initialEntryCount = -1;
2265 }
2266
2267 // Initialize entry count when the function has no existing entry
2268 // count value.
2269 if (!F.getEntryCount())
2270 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
     // OwnedORE keeps a locally created emitter alive for the rest of this
     // call when no analysis manager is available.
2271 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2272 if (AM) {
2273 auto &FAM =
     // NOTE(review): listing lines 2274 and 2276 (the proxy lookup and the
     // assignment of ORE from FAM) are absent from this extract.
2275 .getManager();
2277 } else {
2278 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2279 ORE = OwnedORE.get();
2280 }
2281
     // NOTE(review): listing line 2282 is absent; the "else" at 2284 implies
     // an if-condition selecting the ContextTracker lookup here.
2283 Samples = ContextTracker->getBaseSamplesFor(F);
2284 else {
2285 Samples = Reader->getSamplesFor(F);
2286 // Try search in previously inlined functions that were split or duplicated
2287 // into base.
2288 if (!Samples) {
     // NOTE(review): listing line 2289, which declares "CanonName" for this
     // scope, is absent from this extract.
2290 auto It = OutlineFunctionSamples.find(FunctionId(CanonName));
2291 if (It != OutlineFunctionSamples.end()) {
2292 Samples = &It->second;
2293 } else if (auto Remapper = Reader->getRemapper()) {
     // Second chance: retry the lookup under the remapped name.
2294 if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {
2295 It = OutlineFunctionSamples.find(FunctionId(*RemppedName));
2296 if (It != OutlineFunctionSamples.end())
2297 Samples = &It->second;
2298 }
2299 }
2300 }
2301 }
2302
2303 if (Samples && !Samples->empty())
2304 return emitAnnotations(F);
2305 return false;
2306}
// NOTE(review): listing lines 2307 and 2309 (the declarator and the last
// parameter line) are absent from this extract. The member-initializer list
// below indicates a constructor that stores the profile file name, the
// remapping file name and the LTO phase, and takes ownership of FS via
// std::move -- confirm the exact signature against upstream.
2308 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2310 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2311 LTOPhase(LTOPhase), FS(std::move(FS)) {}
2312
2317
     // NOTE(review): listing lines 2313-2316 (the signature and the opening
     // statements of this function) are absent from this extract. What is
     // visible: it builds a SampleProfileLoader, initializes it, runs it on
     // the module M, and returns PreservedAnalyses::all() when either step
     // reports false and PreservedAnalyses::none() otherwise.
     //
     // The three lambdas below are per-function accessors passed to the
     // loader. NOTE(review): their bodies (listing lines 2319, 2322, 2325)
     // are absent from this extract.
2318 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2320 };
2321 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2323 };
2324 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2326 };
2327
     // NOTE(review): listing line 2329 (the statement executed when FS is
     // null) is absent from this extract.
2328 if (!FS)
2330
     // File names stored on the pass take precedence; when they are empty the
     // SampleProfileFile / SampleProfileRemappingFile options are used.
2331 SampleProfileLoader SampleLoader(
2332 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2333 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2334 : ProfileRemappingFileName,
2335 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);
2336
     // Initialization failed: report that nothing was changed.
2337 if (!SampleLoader.doInitialization(M, &FAM))
2338 return PreservedAnalyses::all();
2339
     // NOTE(review): listing lines 2340-2341, which declare "PSI" and "CG"
     // used below, are absent from this extract.
2342 if (!SampleLoader.runOnModule(M, &AM, PSI, CG))
2343 return PreservedAnalyses::all();
2344
2345 return PreservedAnalyses::none();
2346}
This file defines the StringMap class.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
LVReader * CurrentReader
Definition: LVReader.cpp:153
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PriorityQueue class.
This file contains the declarations for profiling metadata utility functions.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for SampleProfileMatcher.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< unsigned > MinfuncsForStalenessError("min-functions-for-staleness-error", cl::Hidden, cl::init(50), cl::desc("Skip the check if the number of hot functions is smaller than " "the specified number."))
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > PrecentMismatchForStalenessError("precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80), cl::desc("Reject the profile if the mismatch percent is higher than the " "given number."))
static cl::opt< bool > RemoveProbeAfterProfileAnnotation("sample-profile-remove-probe", cl::Hidden, cl::init(false), cl::desc("Remove pseudo-probe after sample profile annotation."))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< unsigned > HotFuncCutoffForStalenessError("hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000), cl::desc("A function is considered hot for staleness error check if its " "total sample count is above the specified percentile"))
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Diagnostic information for the sample profiler.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:279
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1831
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:281
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Represents the cost of inlining a function.
Definition: InlineCost.h:90
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:131
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:126
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:120
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:202
InlineResult is basically true or false.
Definition: InlineCost.h:180
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:631
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1745
const BasicBlock * getParent() const
Definition: Instruction.h:152
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1636
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A node in the call graph.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:193
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
void refresh()
If no summary is present, attempt to refresh.
bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C) const
Returns true if count C is considered hot with regard to a given hot percentile cutoff value.
Sample profile inference pass.
void computeDominanceAndLoopInfo(FunctionT &F)
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr)
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
This class represents a function that is read from a sample profile.
Definition: FunctionId.h:36
Representation of the samples collected for a function.
Definition: SampleProf.h:744
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const HashKeyMap< std::unordered_map, FunctionId, Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:1036
FunctionId getFunction() const
Return the function name.
Definition: SampleProf.h:1069
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1085
SampleContext & getContext() const
Definition: SampleProf.h:1185
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:996
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:947
uint64_t getGUID() const
Return the GUID of the context's name.
Definition: SampleProf.h:1204
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:971
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1190
This class is a wrapper to associative container MapT<KeyT, ValueT> using the hash value of the origi...
Definition: HashKeyMap.h:53
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:607
This class provides operator overloads to the map container using MD5 as the key type,...
Definition: SampleProf.h:1306
Sample-based profile reader.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
std::unordered_map< FunctionId, uint64_t > CallTargetMap
Definition: SampleProf.h:338
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:406
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:415
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
const CustomOperand< const MCSubtargetInfo & > Msg[]
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:206
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:121
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
static FunctionId getRepInFormat(StringRef Name)
Get the proper representation of a string according to whether the current Format uses MD5 to represe...
Definition: SampleProf.h:1292
std::map< FunctionId, FunctionSamples > FunctionSamplesMap
Definition: SampleProf.h:734
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot with respect to the hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets a vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst which is annotated with value profile metadata.
Definition: InstrProf.cpp:1346
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
cl::opt< int > ProfileInlineLimitMin
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:76
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1229
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based on cost (default heuristic).
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
cl::opt< int > SampleHotCallSiteThreshold
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
cl::opt< int > SampleColdCallSiteThreshold
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1849
@ DS_Warning
static bool skipProfileForFunction(const Function &F)
cl::opt< bool > SortProfiledSCC
cl::opt< int > ProfileInlineLimitMax
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:57
cl::opt< int > ProfileInlineGrowthLimit
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
Used in the streaming interface as the general argument type.
A wrapper of binary function with basic blocks and jumps.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:59
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:206
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:239
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:233