LLVM 20.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringRef.h"
34#include "llvm/ADT/Twine.h"
45#include "llvm/IR/BasicBlock.h"
46#include "llvm/IR/DebugLoc.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalValue.h"
50#include "llvm/IR/InstrTypes.h"
51#include "llvm/IR/Instruction.h"
54#include "llvm/IR/LLVMContext.h"
55#include "llvm/IR/MDBuilder.h"
56#include "llvm/IR/Module.h"
57#include "llvm/IR/PassManager.h"
59#include "llvm/IR/PseudoProbe.h"
66#include "llvm/Support/Debug.h"
70#include "llvm/Transforms/IPO.h"
81#include <algorithm>
82#include <cassert>
83#include <cstdint>
84#include <functional>
85#include <limits>
86#include <map>
87#include <memory>
88#include <queue>
89#include <string>
90#include <system_error>
91#include <utility>
92#include <vector>
93
94using namespace llvm;
95using namespace sampleprof;
96using namespace llvm::sampleprofutil;
98#define DEBUG_TYPE "sample-profile"
99#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
100
101STATISTIC(NumCSInlined,
102 "Number of functions inlined with context sensitive profile");
103STATISTIC(NumCSNotInlined,
104 "Number of functions not inlined with context sensitive profile");
105STATISTIC(NumMismatchedProfile,
106 "Number of functions with CFG mismatched profile");
107STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
108STATISTIC(NumDuplicatedInlinesite,
109 "Number of inlined callsites with a partial distribution factor");
110
111STATISTIC(NumCSInlinedHitMinLimit,
112 "Number of functions with FDO inline stopped due to min size limit");
113STATISTIC(NumCSInlinedHitMaxLimit,
114 "Number of functions with FDO inline stopped due to max size limit");
116 NumCSInlinedHitGrowthLimit,
117 "Number of functions with FDO inline stopped due to growth size limit");
118
119// Command line option to specify the file to read samples from. This is
120// mainly used for debugging.
122 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
123 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
124
125// The named file contains a set of transformations that may have been applied
126// to the symbol names between the program from which the sample data was
127// collected and the current program's symbols.
129 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
130 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
131
133 "salvage-stale-profile", cl::Hidden, cl::init(false),
134 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
135 "location for sample profile query."));
137 SalvageUnusedProfile("salvage-unused-profile", cl::Hidden, cl::init(false),
138 cl::desc("Salvage unused profile by matching with new "
139 "functions on call graph."));
140
142 "report-profile-staleness", cl::Hidden, cl::init(false),
143 cl::desc("Compute and report stale profile statistical metrics."));
144
146 "persist-profile-staleness", cl::Hidden, cl::init(false),
147 cl::desc("Compute stale profile statistical metrics and write it into the "
148 "native object file(.llvm_stats section)."));
149
151 "profile-sample-accurate", cl::Hidden, cl::init(false),
152 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
153 "callsite and function as having 0 samples. Otherwise, treat "
154 "un-sampled callsites and functions conservatively as unknown. "));
155
157 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
158 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
159 "branches and calls as having 0 samples. Otherwise, treat "
160 "them conservatively as unknown. "));
161
163 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
164 cl::desc("For symbols in profile symbol list, regard their profiles to "
165 "be accurate. It may be overriden by profile-sample-accurate. "));
166
168 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
169 cl::desc("Merge past inlinee's profile to outline version if sample "
170 "profile loader decided not to inline a call site. It will "
171 "only be enabled when top-down order of profile loading is "
172 "enabled. "));
173
175 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
176 cl::desc("Do profile annotation and inlining for functions in top-down "
177 "order of call graph during sample profile loading. It only "
178 "works for new pass manager. "));
179
180static cl::opt<bool>
181 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
182 cl::desc("Process functions in a top-down order "
183 "defined by the profiled call graph when "
184 "-sample-profile-top-down-load is on."));
185
187 "sample-profile-inline-size", cl::Hidden, cl::init(false),
188 cl::desc("Inline cold call sites in profile loader if it's beneficial "
189 "for code size."));
190
191// Since profiles are consumed by many passes, turning on this option has
192// side effects. For instance, pre-link SCC inliner would see merged profiles
193// and inline the hot functions (that are skipped in this pass).
195 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
196 cl::desc("If true, artifically skip inline transformation in sample-loader "
197 "pass, and merge (or scale) profiles (as configured by "
198 "--sample-profile-merge-inlinee)."));
199
200namespace llvm {
202 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
203 cl::desc("Sort profiled recursion by edge weights."));
204
206 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
207 cl::desc("The size growth ratio limit for proirity-based sample profile "
208 "loader inlining."));
209
211 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
212 cl::desc("The lower bound of size growth limit for "
213 "proirity-based sample profile loader inlining."));
214
216 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
217 cl::desc("The upper bound of size growth limit for "
218 "proirity-based sample profile loader inlining."));
219
221 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
222 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
223 "inlining."));
224
226 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
227 cl::desc("Threshold for inlining cold callsites"));
228} // namespace llvm
229
231 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
232 cl::desc(
233 "Relative hotness percentage threshold for indirect "
234 "call promotion in proirity-based sample profile loader inlining."));
235
237 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
238 cl::desc(
239 "Skip relative hotness check for ICP up to given number of targets."));
240
242 "hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000),
243 cl::desc("A function is considered hot for staleness error check if its "
244 "total sample count is above the specified percentile"));
245
247 "min-functions-for-staleness-error", cl::Hidden, cl::init(50),
248 cl::desc("Skip the check if the number of hot functions is smaller than "
249 "the specified number."));
250
252 "precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80),
253 cl::desc("Reject the profile if the mismatch percent is higher than the "
254 "given number."));
255
257 "sample-profile-prioritized-inline", cl::Hidden,
258 cl::desc("Use call site prioritized inlining for sample profile loader."
259 "Currently only CSSPGO is supported."));
260
262 "sample-profile-use-preinliner", cl::Hidden,
263 cl::desc("Use the preinliner decisions stored in profile context."));
264
266 "sample-profile-recursive-inline", cl::Hidden,
267 cl::desc("Allow sample loader inliner to inline recursive calls."));
268
270 "sample-profile-remove-probe", cl::Hidden, cl::init(false),
271 cl::desc("Remove pseudo-probe after sample profile annotation."));
272
274 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
275 cl::desc(
276 "Optimization remarks file containing inline remarks to be replayed "
277 "by inlining from sample profile loader."),
278 cl::Hidden);
279
281 "sample-profile-inline-replay-scope",
282 cl::init(ReplayInlinerSettings::Scope::Function),
283 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
284 "Replay on functions that have remarks associated "
285 "with them (default)"),
286 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
287 "Replay on the entire module")),
288 cl::desc("Whether inline replay should be applied to the entire "
289 "Module or just the Functions (default) that are present as "
290 "callers in remarks during sample profile inlining."),
291 cl::Hidden);
292
294 "sample-profile-inline-replay-fallback",
295 cl::init(ReplayInlinerSettings::Fallback::Original),
298 ReplayInlinerSettings::Fallback::Original, "Original",
299 "All decisions not in replay send to original advisor (default)"),
300 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
301 "AlwaysInline", "All decisions not in replay are inlined"),
302 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
303 "All decisions not in replay are not inlined")),
304 cl::desc("How sample profile inline replay treats sites that don't come "
305 "from the replay. Original: defers to original advisor, "
306 "AlwaysInline: inline all sites not in replay, NeverInline: "
307 "inline no sites not in replay"),
308 cl::Hidden);
309
311 "sample-profile-inline-replay-format",
312 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
314 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
315 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
316 "<Line Number>:<Column Number>"),
317 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
318 "LineDiscriminator", "<Line Number>.<Discriminator>"),
319 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
320 "LineColumnDiscriminator",
321 "<Line Number>:<Column Number>.<Discriminator> (default)")),
322 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
323
325 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
326 cl::desc("Max number of promotions for a single indirect "
327 "call callsite in sample profile loader"));
328
330 "overwrite-existing-weights", cl::Hidden, cl::init(false),
331 cl::desc("Ignore existing branch weights on IR and always overwrite."));
332
334 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
335 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
336 "sample-profile inline pass name."));
337
338namespace llvm {
340}
341
342namespace {
343
344using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
345using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
346using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
347using EdgeWeightMap = DenseMap<Edge, uint64_t>;
348using BlockEdgeMap =
350
351class GUIDToFuncNameMapper {
352public:
353 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
354 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
355 : CurrentReader(Reader), CurrentModule(M),
356 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
357 if (!CurrentReader.useMD5())
358 return;
359
360 for (const auto &F : CurrentModule) {
361 StringRef OrigName = F.getName();
362 CurrentGUIDToFuncNameMap.insert(
363 {Function::getGUID(OrigName), OrigName});
364
365 // Local to global var promotion used by optimization like thinlto
366 // will rename the var and add suffix like ".llvm.xxx" to the
367 // original local name. In sample profile, the suffixes of function
368 // names are all stripped. Since it is possible that the mapper is
369 // built in post-thin-link phase and var promotion has been done,
370 // we need to add the substring of function name without the suffix
371 // into the GUIDToFuncNameMap.
373 if (CanonName != OrigName)
374 CurrentGUIDToFuncNameMap.insert(
375 {Function::getGUID(CanonName), CanonName});
376 }
377
378 // Update GUIDToFuncNameMap for each function including inlinees.
379 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
380 }
381
382 ~GUIDToFuncNameMapper() {
383 if (!CurrentReader.useMD5())
384 return;
385
386 CurrentGUIDToFuncNameMap.clear();
387
388 // Reset GUIDToFuncNameMap for each function as they're no
389 // longer valid at this point.
390 SetGUIDToFuncNameMapForAll(nullptr);
391 }
392
393private:
394 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
395 std::queue<FunctionSamples *> FSToUpdate;
396 for (auto &IFS : CurrentReader.getProfiles()) {
397 FSToUpdate.push(&IFS.second);
398 }
399
400 while (!FSToUpdate.empty()) {
401 FunctionSamples *FS = FSToUpdate.front();
402 FSToUpdate.pop();
403 FS->GUIDToFuncNameMap = Map;
404 for (const auto &ICS : FS->getCallsiteSamples()) {
405 const FunctionSamplesMap &FSMap = ICS.second;
406 for (const auto &IFS : FSMap) {
407 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
408 FSToUpdate.push(&FS);
409 }
410 }
411 }
412 }
413
415 Module &CurrentModule;
416 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
417};
418
419// Inline candidate used by iterative callsite prioritized inliner
420struct InlineCandidate {
421 CallBase *CallInstr;
422 const FunctionSamples *CalleeSamples;
423 // Prorated callsite count, which will be used to guide inlining. For example,
424 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
425 // copies will get their own distribution factors and their prorated counts
426 // will be used to decide if they should be inlined independently.
427 uint64_t CallsiteCount;
428 // Call site distribution factor to prorate the profile samples for a
429 // duplicated callsite. Default value is 1.0.
430 float CallsiteDistribution;
431};
432
433// Inline candidate comparer using call site weight
434struct CandidateComparer {
435 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
436 if (LHS.CallsiteCount != RHS.CallsiteCount)
437 return LHS.CallsiteCount < RHS.CallsiteCount;
438
439 const FunctionSamples *LCS = LHS.CalleeSamples;
440 const FunctionSamples *RCS = RHS.CalleeSamples;
441 // In inline replay mode, CalleeSamples may be null and the order doesn't
442 // matter.
443 if (!LCS || !RCS)
444 return LCS;
445
446 // Tie breaker using number of samples try to favor smaller functions first
447 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
448 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
449
450 // Tie breaker using GUID so we have stable/deterministic inlining order
451 return LCS->getGUID() < RCS->getGUID();
452 }
453};
454
455using CandidateQueue =
457 CandidateComparer>;
458
459/// Sample profile pass.
460///
461/// This pass reads profile data from the file specified by
462/// -sample-profile-file and annotates every affected function with the
463/// profile information found in that file.
464class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {
465public:
466 SampleProfileLoader(
467 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
469 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
470 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
471 std::function<const TargetLibraryInfo &(Function &)> GetTLI,
472 LazyCallGraph &CG, bool DisableSampleProfileInlining,
473 bool UseFlattenedProfile)
475 std::move(FS)),
476 GetAC(std::move(GetAssumptionCache)),
477 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
478 CG(CG), LTOPhase(LTOPhase),
479 AnnotatedPassName(AnnotateSampleProfileInlinePhase
483 DisableSampleProfileInlining(DisableSampleProfileInlining),
484 UseFlattenedProfile(UseFlattenedProfile) {}
485
486 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
487 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
488 ProfileSummaryInfo *_PSI);
489
490protected:
492 bool emitAnnotations(Function &F);
494 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
495 const FunctionSamples *
496 findFunctionSamples(const Instruction &I) const override;
497 std::vector<const FunctionSamples *>
498 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
499 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
500 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
501 uint64_t Threshold);
502 // Attempt to promote indirect call and also inline the promoted call
503 bool tryPromoteAndInlineCandidate(
504 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
505 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
506
507 bool inlineHotFunctions(Function &F,
508 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
509 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
510 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
511 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
512 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
513 bool
514 tryInlineCandidate(InlineCandidate &Candidate,
515 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
516 bool
517 inlineHotFunctionsWithPriority(Function &F,
518 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
519 // Inline cold/small functions in addition to hot ones
520 bool shouldInlineColdCallee(CallBase &CallInst);
521 void emitOptimizationRemarksForInlineCandidates(
522 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
523 bool Hot);
524 void promoteMergeNotInlinedContextSamples(
526 const Function &F);
527 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
528 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
529 void generateMDProfMetadata(Function &F);
530 bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
531 const SampleProfileMap &Profiles);
532 void removePseudoProbeInstsDiscriminator(Module &M);
533
534 /// Map from function name to Function *. Used to find the function from
535 /// the function name. If the function name contains suffix, additional
536 /// entry is added to map from the stripped name to the function if there
537 /// is one-to-one mapping.
539
540 /// Map from function name to profile name generated by call-graph based
541 /// profile fuzzy matching(--salvage-unused-profile).
543
544 std::function<AssumptionCache &(Function &)> GetAC;
545 std::function<TargetTransformInfo &(Function &)> GetTTI;
546 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
547 LazyCallGraph &CG;
548
549 /// Profile tracker for different context.
550 std::unique_ptr<SampleContextTracker> ContextTracker;
551
552 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
553 ///
554 /// We need to know the LTO phase because for example in ThinLTOPrelink
555 /// phase, in annotation, we should not promote indirect calls. Instead,
556 /// we will mark GUIDs that needs to be annotated to the function.
557 const ThinOrFullLTOPhase LTOPhase;
558 const std::string AnnotatedPassName;
559
560 /// Profile symbol list tells whether a function name appears in the binary
561 /// used to generate the current profile.
562 std::shared_ptr<ProfileSymbolList> PSL;
563
564 /// Total number of samples collected in this profile.
565 ///
566 /// This is the sum of all the samples collected in all the functions executed
567 /// at runtime.
568 uint64_t TotalCollectedSamples = 0;
569
570 // Information recorded when we declined to inline a call site
571 // because we have determined it is too cold is accumulated for
572 // each callee function. Initially this is just the entry count.
573 struct NotInlinedProfileInfo {
574 uint64_t entryCount;
575 };
577
578 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
579 // all the function symbols defined or declared in current module.
580 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
581
582 // All the Names used in FunctionSamples including outline function
583 // names, inline instance names and call target names.
584 StringSet<> NamesInProfile;
585 // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is
586 // populated, depends on whether the profile uses MD5. Because the name table
587 // generally contains several magnitude more entries than the number of
588 // functions, we do not want to convert all names from one form to another.
589 llvm::DenseSet<uint64_t> GUIDsInProfile;
590
591 // For symbol in profile symbol list, whether to regard their profiles
592 // to be accurate. It is mainly decided by existence of profile symbol
593 // list and -profile-accurate-for-symsinlist flag, but it can be
594 // overridden by -profile-sample-accurate or profile-sample-accurate
595 // attribute.
596 bool ProfAccForSymsInList;
597
598 bool DisableSampleProfileInlining;
599
600 bool UseFlattenedProfile;
601
602 // External inline advisor used to replay inline decision from remarks.
603 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
604
605 // A helper to implement the sample profile matching algorithm.
606 std::unique_ptr<SampleProfileMatcher> MatchingManager;
607
608private:
609 const char *getAnnotatedRemarkPassName() const {
610 return AnnotatedPassName.c_str();
611 }
612};
613} // end anonymous namespace
614
615namespace llvm {
616template <>
618 return succ_empty(BB);
619}
620
621template <>
623 const std::vector<const BasicBlockT *> &BasicBlocks,
624 BlockEdgeMap &Successors, FlowFunction &Func) {
625 for (auto &Jump : Func.Jumps) {
626 const auto *BB = BasicBlocks[Jump.Source];
627 const auto *Succ = BasicBlocks[Jump.Target];
628 const Instruction *TI = BB->getTerminator();
629 // Check if a block ends with InvokeInst and mark non-taken branch unlikely.
630 // In that case block Succ should be a landing pad
631 if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
632 if (isa<InvokeInst>(TI)) {
633 Jump.IsUnlikely = true;
634 }
635 }
636 const Instruction *SuccTI = Succ->getTerminator();
637 // Check if the target block contains UnreachableInst and mark it unlikely
638 if (SuccTI->getNumSuccessors() == 0) {
639 if (isa<UnreachableInst>(SuccTI)) {
640 Jump.IsUnlikely = true;
641 }
642 }
643 }
644}
645
646template <>
648 Function &F) {
649 DT.reset(new DominatorTree);
650 DT->recalculate(F);
651
652 PDT.reset(new PostDominatorTree(F));
653
654 LI.reset(new LoopInfo);
655 LI->analyze(*DT);
656}
657} // namespace llvm
658
659ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
661 return getProbeWeight(Inst);
662
663 const DebugLoc &DLoc = Inst.getDebugLoc();
664 if (!DLoc)
665 return std::error_code();
666
667 // Ignore all intrinsics, phinodes and branch instructions.
668 // Branch and phinodes instruction usually contains debug info from sources
669 // outside of the residing basic block, thus we ignore them during annotation.
670 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
671 return std::error_code();
672
673 // For non-CS profile, if a direct call/invoke instruction is inlined in
674 // profile (findCalleeFunctionSamples returns non-empty result), but not
675 // inlined here, it means that the inlined callsite has no sample, thus the
676 // call instruction should have 0 count.
677 // For CS profile, the callsite count of previously inlined callees is
678 // populated with the entry count of the callees.
680 if (const auto *CB = dyn_cast<CallBase>(&Inst))
681 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
682 return 0;
683
684 return getInstWeightImpl(Inst);
685}
686
687/// Get the FunctionSamples for a call instruction.
688///
689/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
690/// instance in which that call instruction is calling to. It contains
691/// all samples that reside in the inlined instance. We first find the
692/// inlined instance in which the call instruction is from, then we
693/// traverse its children to find the callsite with the matching
694/// location.
695///
696/// \param Inst Call/Invoke instruction to query.
697///
698/// \returns The FunctionSamples pointer to the inlined instance.
699const FunctionSamples *
700SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
701 const DILocation *DIL = Inst.getDebugLoc();
702 if (!DIL) {
703 return nullptr;
704 }
705
706 StringRef CalleeName;
707 if (Function *Callee = Inst.getCalledFunction())
708 CalleeName = Callee->getName();
709
711 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
712
713 const FunctionSamples *FS = findFunctionSamples(Inst);
714 if (FS == nullptr)
715 return nullptr;
716
717 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
718 CalleeName, Reader->getRemapper(),
719 &FuncNameToProfNameMap);
720}
721
722/// Returns a vector of FunctionSamples that are the indirect call targets
723/// of \p Inst. The vector is sorted by the total number of samples. Stores
724/// the total call count of the indirect call in \p Sum.
725std::vector<const FunctionSamples *>
726SampleProfileLoader::findIndirectCallFunctionSamples(
727 const Instruction &Inst, uint64_t &Sum) const {
728 const DILocation *DIL = Inst.getDebugLoc();
729 std::vector<const FunctionSamples *> R;
730
731 if (!DIL) {
732 return R;
733 }
734
735 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
736 assert(L && R && "Expect non-null FunctionSamples");
737 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
738 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
739 return L->getGUID() < R->getGUID();
740 };
741
743 auto CalleeSamples =
744 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
745 if (CalleeSamples.empty())
746 return R;
747
748 // For CSSPGO, we only use target context profile's entry count
749 // as that already includes both inlined callee and non-inlined ones.
750 Sum = 0;
751 for (const auto *const FS : CalleeSamples) {
752 Sum += FS->getHeadSamplesEstimate();
753 R.push_back(FS);
754 }
755 llvm::sort(R, FSCompare);
756 return R;
757 }
758
759 const FunctionSamples *FS = findFunctionSamples(Inst);
760 if (FS == nullptr)
761 return R;
762
764 Sum = 0;
765 if (auto T = FS->findCallTargetMapAt(CallSite))
766 for (const auto &T_C : *T)
767 Sum += T_C.second;
768 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
769 if (M->empty())
770 return R;
771 for (const auto &NameFS : *M) {
772 Sum += NameFS.second.getHeadSamplesEstimate();
773 R.push_back(&NameFS.second);
774 }
775 llvm::sort(R, FSCompare);
776 }
777 return R;
778}
779
780const FunctionSamples *
781SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
783 std::optional<PseudoProbe> Probe = extractProbe(Inst);
784 if (!Probe)
785 return nullptr;
786 }
787
788 const DILocation *DIL = Inst.getDebugLoc();
789 if (!DIL)
790 return Samples;
791
792 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
793 if (it.second) {
795 it.first->second = ContextTracker->getContextSamplesFor(DIL);
796 else
797 it.first->second = Samples->findFunctionSamples(
798 DIL, Reader->getRemapper(), &FuncNameToProfNameMap);
799 }
800 return it.first->second;
801}
802
803/// Check whether the indirect call promotion history of \p Inst allows
804/// the promotion for \p Candidate.
805/// If the profile count for the promotion candidate \p Candidate is
806/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
807/// for \p Inst. If we already have at least MaxNumPromotions
808/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
809/// cannot promote for \p Inst anymore.
810static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
811 uint64_t TotalCount = 0;
812 auto ValueData = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget,
813 MaxNumPromotions, TotalCount, true);
814 // No valid value profile so no promoted targets have been recorded
815 // before. Ok to do ICP.
816 if (ValueData.empty())
817 return true;
818
819 unsigned NumPromoted = 0;
820 for (const auto &V : ValueData) {
821 if (V.Count != NOMORE_ICP_MAGICNUM)
822 continue;
823
824 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
825 // metadata, it means the candidate has been promoted for this
826 // indirect call.
827 if (V.Value == Function::getGUID(Candidate))
828 return false;
829 NumPromoted++;
830 // If already have MaxNumPromotions promotion, don't do it anymore.
831 if (NumPromoted == MaxNumPromotions)
832 return false;
833 }
834 return true;
835}
836
837/// Update indirect call target profile metadata for \p Inst.
838/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
839/// If it is 0, it means updateIDTMetaData is used to mark a
840/// certain target to be promoted already. If it is not zero,
841/// we expect to use it to update the total count in the value profile.
///
/// \param CallTargets Targets to record. When \p Sum is 0 this must hold
/// exactly one entry whose count is NOMORE_ICP_MAGICNUM (asserted below).
/// \param Sum Total count to annotate, or 0 to request "mark as promoted".
///
/// Does nothing when MaxNumPromotions is 0.
// NOTE(review): the embedded listing numbers jump over 843 and 896, so those
// source lines are missing from this excerpt. Presumably 843 carries the
// function name with the \p Inst parameter and 896 declares NewCallTargets --
// confirm against the upstream file.
842static void
844 const SmallVectorImpl<InstrProfValueData> &CallTargets,
845 uint64_t Sum) {
846 // Bail out early if MaxNumPromotions is zero.
847 // This prevents allocating an array of zero length below.
848 //
849 // Note `updateIDTMetaData` is called in two places so check
850 // `MaxNumPromotions` inside it.
851 if (MaxNumPromotions == 0)
852 return;
853 // OldSum is the existing total count in the value profile data.
854 uint64_t OldSum = 0;
855 auto ValueData = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget,
856 MaxNumPromotions, OldSum, true);
857
 // Maps a target value to the count that will be written back.
858 DenseMap<uint64_t, uint64_t> ValueCountMap;
859 if (Sum == 0) {
 // "Mark as promoted" mode: keep the existing entries and flag one target.
860 assert((CallTargets.size() == 1 &&
861 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
862 "If sum is 0, assume only one element in CallTargets "
863 "with count being NOMORE_ICP_MAGICNUM");
864 // Initialize ValueCountMap with existing value profile data.
865 for (const auto &V : ValueData)
866 ValueCountMap[V.Value] = V.Count;
867 auto Pair =
868 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
869 // If the target already exists in value profile, decrease the total
870 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
871 if (!Pair.second) {
872 OldSum -= Pair.first->second;
873 Pair.first->second = NOMORE_ICP_MAGICNUM;
874 }
 // Sum was 0 on entry; annotate with the (possibly reduced) existing total.
875 Sum = OldSum;
876 } else {
877 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
878 // counts in the value profile.
879 for (const auto &V : ValueData) {
880 if (V.Count == NOMORE_ICP_MAGICNUM)
881 ValueCountMap[V.Value] = V.Count;
882 }
883
884 for (const auto &Data : CallTargets) {
885 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
 // Newly inserted target: its count stays part of Sum.
886 if (Pair.second)
887 continue;
888 // The target represented by Data.Value has already been promoted.
889 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
890 // Sum by Data.Count.
891 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
892 Sum -= Data.Count;
893 }
894 }
895
 // Flatten the map into a vector of value/count records.
897 for (const auto &ValueCount : ValueCountMap) {
898 NewCallTargets.emplace_back(
899 InstrProfValueData{ValueCount.first, ValueCount.second});
900 }
901
 // Order by descending count, breaking ties by descending value. Map keys
 // are unique, so this comparator never reports two entries as equal.
902 llvm::sort(NewCallTargets,
903 [](const InstrProfValueData &L, const InstrProfValueData &R) {
904 if (L.Count != R.Count)
905 return L.Count > R.Count;
906 return L.Value > R.Value;
907 });
908
 // Annotate at most MaxNumPromotions entries.
909 uint32_t MaxMDCount =
910 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
911 annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
912 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
913}
914
915/// Attempt to promote indirect call and also inline the promoted call.
916///
917/// \param F Caller function.
918/// \param Candidate ICP and inline candidate.
919/// \param SumOrigin Original sum of target counts for indirect call before
920/// promoting given candidate.
921/// \param Sum Prorated sum of remaining target counts for indirect call
922/// after promoting given candidate.
923/// \param InlinedCallSite Output vector for new call sites exposed after
924/// inlining.
/// \returns true only when the promoted call was also inlined. Every other
/// path (inlining disabled, MaxNumPromotions == 0, callee not in SymbolMap,
/// doesHistoryAllowICP refusing, promotion not legal, inlining failed)
/// returns false.
// NOTE(review): the embedded listing numbers jump over 983 and 991, so those
// source lines are missing from this excerpt (the callee name of the
// proration call and part of the debug message) -- confirm against upstream.
925bool SampleProfileLoader::tryPromoteAndInlineCandidate(
926 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
927 SmallVector<CallBase *, 8> *InlinedCallSite) {
928 // Bail out early if sample-loader inliner is disabled.
929 if (DisableSampleProfileInlining)
930 return false;
931
932 // Bail out early if MaxNumPromotions is zero.
933 // This prevents allocating an array of zero length in callees below.
934 if (MaxNumPromotions == 0)
935 return false;
 // Resolve the profiled callee to a function in this module; give up when it
 // is absent from SymbolMap or mapped to null.
936 auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();
937 auto R = SymbolMap.find(CalleeFunctionName);
938 if (R == SymbolMap.end() || !R->second)
939 return false;
940
941 auto &CI = *Candidate.CallInstr;
942 if (!doesHistoryAllowICP(CI, R->second->getName()))
943 return false;
944
945 const char *Reason = "Callee function not available";
946 // The R->second != &F check is to prevent promoting a recursive call.
947 // If it is a recursive call, we do not inline it as it could bloat
948 // the code exponentially. There is a way to better handle this, e.g.
949 // clone the caller first, and inline the cloned caller if it is
950 // recursive. As llvm does not inline recursive calls, we will
951 // simply ignore it instead of handling it explicitly.
952 if (!R->second->isDeclaration() && R->second->getSubprogram() &&
953 R->second->hasFnAttribute("use-sample-profile") &&
954 R->second != &F && isLegalToPromote(CI, R->second, &Reason)) {
955 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
956 // in the value profile metadata so the target won't be promoted again.
957 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
958 Function::getGUID(R->second->getName()), NOMORE_ICP_MAGICNUM}};
959 updateIDTMetaData(CI, SortedCallTargets, 0);
960
 // NOTE(review): DI is the address of the object promoteIndirectCall
 // returns, so the `if (DI)` test below looks like it can never be false --
 // confirm whether the check is intentional.
961 auto *DI = &pgo::promoteIndirectCall(
962 CI, R->second, Candidate.CallsiteCount, Sum, false, ORE);
963 if (DI) {
964 Sum -= Candidate.CallsiteCount;
965 // Do not prorate the indirect callsite distribution since the original
966 // distribution will be used to scale down non-promoted profile target
967 // counts later. By doing this we lose track of the real callsite count
968 // for the leftover indirect callsite as a trade off for accurate call
969 // target counts.
970 // TODO: Ideally we would have two separate factors, one for call site
971 // counts and one is used to prorate call target counts.
972 // Do not update the promoted direct callsite distribution at this
973 // point since the original distribution combined with the callee profile
974 // will be used to prorate callsites from the callee if inlined. Once not
975 // inlined, the direct callsite distribution should be prorated so that
976 // it will reflect the real callsite counts.
 // From here on the candidate refers to the promoted direct call.
977 Candidate.CallInstr = DI;
978 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
979 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
980 if (!Inlined) {
981 // Prorate the direct callsite distribution so that it reflects real
982 // callsite counts.
 // NOTE(review): SumOrigin is the divisor here; presumably callers
 // never pass 0 -- verify.
984 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
985 }
986 return Inlined;
987 }
988 }
989 } else {
990 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
992 Candidate.CallInstr->getName())<< " because "
993 << Reason << "\n");
994 }
995 return false;
996}
997
/// Decide whether the cold call site \p CallInst should be inlined.
///
/// Returns false when the cost is "never", true when it is "always", and
/// otherwise compares the cost against SampleColdCallSiteThreshold.
// NOTE(review): the embedded listing numbers jump over 999, 1002 and 1006, so
// the guard for the first `return false`, the definition of Callee and the
// start of the Cost computation are missing from this excerpt -- confirm
// against the upstream file before relying on this description.
998bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1000 return false;
1001
 // Without a known callee there is nothing to cost.
1003 if (Callee == nullptr)
1004 return false;
1005
1007 GetAC, GetTLI);
1008
1009 if (Cost.isNever())
1010 return false;
1011
1012 if (Cost.isAlways())
1013 return true;
1014
 // A cost equal to the threshold still qualifies (<=).
1015 return Cost.getCost() <= SampleColdCallSiteThreshold;
1016}
1017
1018void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1019 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1020 bool Hot) {
1021 for (auto *I : Candidates) {
1022 Function *CalledFunction = I->getCalledFunction();
1023 if (CalledFunction) {
1024 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1025 "InlineAttempt", I->getDebugLoc(),
1026 I->getParent())
1027 << "previous inlining reattempted for "
1028 << (Hot ? "hotness: '" : "size: '")
1029 << ore::NV("Callee", CalledFunction) << "' into '"
1030 << ore::NV("Caller", &F) << "'");
1031 }
1032 }
1033}
1034
/// Record in \p InlinedGUIDs the GUIDs of functions that are not defined in
/// this module but are wanted for inlining at (or below) call site \p CB.
///
/// \param CB Call site being examined; may be null (it is null-checked).
/// \param Samples Profile for the callee; the function returns early when it
/// is null.
/// \param InlinedGUIDs Output set of GUIDs to import.
/// \param Threshold Count cutoff for callees and call targets. It is reset to
/// 0 when the external inline advisor wants \p CB inlined.
// NOTE(review): the embedded listing numbers jump over 1063, 1065, 1086 and
// 1087, so the condition opening the AutoFDO branch, the condition guarding
// the second `Threshold = 0`, and the initializer of PreInline are missing
// from this excerpt -- confirm against the upstream file.
1035void SampleProfileLoader::findExternalInlineCandidate(
1036 CallBase *CB, const FunctionSamples *Samples,
1037 DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {
1038
1039 // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
1040 // function make sure it's imported
1041 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1042 // Samples may not exist for replayed function, if so
1043 // just add the direct GUID and move on
1044 if (!Samples) {
1045 InlinedGUIDs.insert(
1046 Function::getGUID(CB->getCalledFunction()->getName()));
1047 return;
1048 }
1049 // Otherwise, drop the threshold to import everything that we can
1050 Threshold = 0;
1051 }
1052
1053 // In some rare cases, call instruction could be changed after being pushed
1054 // into inline candidate queue, this is because earlier inlining may expose
1055 // constant propagation which can change indirect call to direct call. When
1056 // this happens, we may fail to find matching function samples for the
1057 // candidate later, even if a match was found when the candidate was enqueued.
1058 if (!Samples)
1059 return;
1060
1061 // For AutoFDO profile, retrieve candidate profiles by walking over
1062 // the nested inlinee profiles.
1064 // Set threshold to zero to honor pre-inliner decision.
1066 Threshold = 0;
1067 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1068 return;
1069 }
1070
 // Breadth-first walk over the context trie, starting at the node that owns
 // Samples.
1071 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1072 std::queue<ContextTrieNode *> CalleeList;
1073 CalleeList.push(Caller);
1074 while (!CalleeList.empty()) {
1075 ContextTrieNode *Node = CalleeList.front();
1076 CalleeList.pop();
1077 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1078 // For CSSPGO profile, retrieve candidate profile by walking over the
1079 // trie built for context profile. Note that also take call targets
1080 // even if callee doesn't have a corresponding context profile.
1081 if (!CalleeSample)
1082 continue;
1083
1084 // If pre-inliner decision is used, honor that for importing as well.
1085 bool PreInline =
 // Skip this node (and, because of the `continue`, its children) when it is
 // below the threshold and not selected by the pre-inliner.
1088 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1089 continue;
1090
1091 Function *Func = SymbolMap.lookup(CalleeSample->getFunction());
1092 // Add to the import list only when it's defined out of module.
1093 if (!Func || Func->isDeclaration())
1094 InlinedGUIDs.insert(CalleeSample->getGUID());
1095
1096 // Import hot CallTargets, which may not be available in IR because full
1097 // profile annotation cannot be done until backend compilation in ThinLTO.
1098 for (const auto &BS : CalleeSample->getBodySamples())
1099 for (const auto &TS : BS.second.getCallTargets())
1100 if (TS.second > Threshold) {
1101 const Function *Callee = SymbolMap.lookup(TS.first);
1102 if (!Callee || Callee->isDeclaration())
1103 InlinedGUIDs.insert(TS.first.getHashCode());
1104 }
1105
1106 // Import hot child context profile associated with callees. Note that this
1107 // may have some overlap with the call target loop above, but doing this
1108 // based child context profile again effectively allow us to use the max of
1109 // entry count and call target count to determine importing.
1110 for (auto &Child : Node->getAllChildContext()) {
1111 ContextTrieNode *CalleeNode = &Child.second;
1112 CalleeList.push(CalleeNode);
1113 }
1114 }
1115}
1116
1117/// Iteratively inline hot callsites of a function.
1118///
1119/// Iteratively traverse all callsites of the function \p F, so as to
1120/// find out callsites with corresponding inline instances.
1121///
1122/// For such callsites,
1123/// - If it is hot enough, inline the callsites and adds callsites of the callee
1124/// into the caller. If the call is an indirect call, first promote
1125/// it to direct call. Each indirect call is limited with a single target.
1126///
1127/// - If a callsite is not inlined, merge its profile to the outline
1128/// version (if --sample-profile-merge-inlinee is true), or scale the
1129/// counters of standalone function based on the profile of inlined
1130/// instances (if --sample-profile-merge-inlinee is false).
1131///
1132/// Later passes may consume the updated profiles.
1133///
1134/// \param F function to perform iterative inlining.
1135/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1136/// inlined in the profiled binary.
1137///
1138/// \returns True if there is any inline happened.
// NOTE(review): the embedded listing numbers jump over 1144, 1154, 1168 and
// 1231, so part of the assertion, the declaration of CIS, the second half of
// the condition before LocalNotInlinedCallSites.insert, and the guard in
// front of promoteMergeNotInlinedContextSamples are missing from this
// excerpt -- confirm against the upstream file.
1139bool SampleProfileLoader::inlineHotFunctions(
1140 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1141 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1142 // Profile symbol list is ignored when profile-sample-accurate is on.
1143 assert((!ProfAccForSymsInList ||
1145 !F.hasFnAttribute("profile-sample-accurate"))) &&
1146 "ProfAccForSymsInList should be false when profile-sample-accurate "
1147 "is enabled");
1148
 // Call sites seen with a profile but not (yet) inlined; entries are erased
 // again once inlining succeeds.
1149 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1150 bool Changed = false;
1151 bool LocalChanged = true;
 // Repeat until one full sweep over F inlines nothing.
1152 while (LocalChanged) {
1153 LocalChanged = false;
1155 for (auto &BB : F) {
 // Hot is tracked per basic block: one hot call site makes every candidate
 // in the block eligible.
1156 bool Hot = false;
1157 SmallVector<CallBase *, 10> AllCandidates;
1158 SmallVector<CallBase *, 10> ColdCandidates;
1159 for (auto &I : BB) {
1160 const FunctionSamples *FS = nullptr;
1161 if (auto *CB = dyn_cast<CallBase>(&I)) {
1162 if (!isa<IntrinsicInst>(I)) {
1163 if ((FS = findCalleeFunctionSamples(*CB))) {
1164 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1165 "GUIDToFuncNameMap has to be populated");
1166 AllCandidates.push_back(CB);
1167 if (FS->getHeadSamplesEstimate() > 0 ||
1169 LocalNotInlinedCallSites.insert({CB, FS});
1170 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1171 Hot = true;
1172 else if (shouldInlineColdCallee(*CB))
1173 ColdCandidates.push_back(CB);
1174 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1175 AllCandidates.push_back(CB);
1176 }
1177 }
1178 }
1179 }
 // Candidates are inserted at the front of CIS, so later blocks end up
 // ahead of earlier ones.
1180 if (Hot || ExternalInlineAdvisor) {
1181 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1182 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1183 } else {
1184 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1185 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1186 }
1187 }
1188 for (CallBase *I : CIS) {
1189 Function *CalledFunction = I->getCalledFunction();
1190 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1191 0 /* dummy count */,
1192 1.0 /* dummy distribution factor */};
1193 // Do not inline recursive calls.
1194 if (CalledFunction == &F)
1195 continue;
1196 if (I->isIndirectCall()) {
 // NOTE(review): Sum has no initializer here, whereas
 // inlineHotFunctionsWithPriority starts it at 0; presumably
 // findIndirectCallFunctionSamples always assigns it -- confirm.
1197 uint64_t Sum;
1198 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1199 uint64_t SumOrigin = Sum;
 // In ThinLTO pre-link only record import candidates; no promotion.
1200 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1201 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1202 PSI->getOrCompHotCountThreshold());
1203 continue;
1204 }
1205 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1206 continue;
1207
1208 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1209 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1210 LocalNotInlinedCallSites.erase(I);
1211 LocalChanged = true;
1212 }
1213 }
1214 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1215 !CalledFunction->isDeclaration()) {
 // Direct call to a function defined in this module that has a
 // subprogram attached.
1216 if (tryInlineCandidate(Candidate)) {
1217 LocalNotInlinedCallSites.erase(I);
1218 LocalChanged = true;
1219 }
1220 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1221 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1222 InlinedGUIDs,
1223 PSI->getOrCompHotCountThreshold());
1224 }
1225 }
1226 Changed |= LocalChanged;
1227 }
1228
1229 // For CS profile, profile for not inlined context will be merged when
1230 // base profile is being retrieved.
1232 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1233 return Changed;
1234}
1235
/// Try to inline the direct call described by \p Candidate.
///
/// \param Candidate Call site plus callee profile and distribution factor.
/// The call must have a known callee (asserted).
/// \param InlinedCallSites If non-null, cleared and then filled with the call
/// sites that inlining exposed. It is left untouched when inlining does not
/// happen.
/// \returns true if InlineFunction succeeded; false when inlining is
/// disabled, the cost decision rejects it, or InlineFunction fails.
// NOTE(review): the embedded listing number jumps over 1278, so the statement
// in front of the markContextSamplesInlined call (presumably a guard on
// ContextTracker) is missing from this excerpt -- confirm against upstream.
1236bool SampleProfileLoader::tryInlineCandidate(
1237 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1238 // Do not attempt to inline a candidate if
1239 // --disable-sample-loader-inlining is true.
1240 if (DisableSampleProfileInlining)
1241 return false;
1242
 // Capture everything needed for remarks up front; the call instruction is
 // not usable after a successful InlineFunction (see below).
1243 CallBase &CB = *Candidate.CallInstr;
1244 Function *CalledFunction = CB.getCalledFunction();
1245 assert(CalledFunction && "Expect a callee with definition");
1246 DebugLoc DLoc = CB.getDebugLoc();
1247 BasicBlock *BB = CB.getParent();
1248
1249 InlineCost Cost = shouldInlineCandidate(Candidate);
1250 if (Cost.isNever()) {
1251 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1252 "InlineFail", DLoc, BB)
1253 << "incompatible inlining");
1254 return false;
1255 }
1256
 // Not "never", but the cost still evaluates to false: decline without a
 // remark.
1257 if (!Cost)
1258 return false;
1259
1260 InlineFunctionInfo IFI(GetAC);
1261 IFI.UpdateProfile = false;
1262 InlineResult IR = InlineFunction(CB, IFI,
1263 /*MergeAttributes=*/true);
1264 if (!IR.isSuccess())
1265 return false;
1266
1267 // The call to InlineFunction erases the call instruction, so we can't pass
 // it here; use the DLoc and BB captured above instead.
1268 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1269 Cost, true, getAnnotatedRemarkPassName());
1270
1271 // Now populate the list of newly exposed call sites.
1272 if (InlinedCallSites) {
1273 InlinedCallSites->clear();
1274 for (auto &I : IFI.InlinedCallSites)
1275 InlinedCallSites->push_back(I);
1276 }
1277
1279 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1280 ++NumCSInlined;
1281
1282 // Prorate inlined probes for a duplicated inlining callsite which probably
1283 // has a distribution less than 100%. Samples for an inlinee should be
1284 // distributed among the copies of the original callsite based on each
1285 // callsite's distribution factor for counts accuracy. Note that an inlined
1286 // probe may come with its own distribution factor if it has been duplicated
1287 // in the inlinee body. The two factor are multiplied to reflect the
1288 // aggregation of duplication.
1289 if (Candidate.CallsiteDistribution < 1) {
1290 for (auto &I : IFI.InlinedCallSites) {
1291 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1292 setProbeDistributionFactor(*I, Probe->Factor *
1293 Candidate.CallsiteDistribution);
1294 }
1295 NumDuplicatedInlinesite++;
1296 }
1297
1298 return true;
1299}
1300
1301bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1302 CallBase *CB) {
1303 assert(CB && "Expect non-null call instruction");
1304
1305 if (isa<IntrinsicInst>(CB))
1306 return false;
1307
1308 // Find the callee's profile. For indirect call, find hottest target profile.
1309 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1310 // If ExternalInlineAdvisor wants to inline this site, do so even
1311 // if Samples are not present.
1312 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1313 return false;
1314
1315 float Factor = 1.0;
1316 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1317 Factor = Probe->Factor;
1318
1319 uint64_t CallsiteCount =
1320 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1321 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1322 return true;
1323}
1324
1325std::optional<InlineCost>
1326SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1327 std::unique_ptr<InlineAdvice> Advice = nullptr;
1328 if (ExternalInlineAdvisor) {
1329 Advice = ExternalInlineAdvisor->getAdvice(CB);
1330 if (Advice) {
1331 if (!Advice->isInliningRecommended()) {
1332 Advice->recordUnattemptedInlining();
1333 return InlineCost::getNever("not previously inlined");
1334 }
1335 Advice->recordInlining();
1336 return InlineCost::getAlways("previously inlined");
1337 }
1338 }
1339
1340 return {};
1341}
1342
1343bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1344 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1345 return Cost ? !!*Cost : false;
1346}
1347
/// Compute the inline decision for \p Candidate.
///
/// Order of precedence as visible here: a verdict from the external inline
/// advisor; a "cold callsite" rejection; an always/never answer from
/// getInlineCost; the pre-inliner decision stored in the profile; and
/// finally the analyzer's cost paired with a sample-PGO threshold.
// NOTE(review): the embedded listing numbers jump over 1348, 1356, 1369,
// 1398, 1405 and 1406, so the return-type line, the condition opening the
// hotness block, one line after ComputeFullInlineCost, the second half of
// the pre-inliner condition, and the body of the "old FDO inliner" block are
// missing from this excerpt -- confirm against the upstream file.
1349SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
 // A replay verdict, when present, is returned as-is.
1350 if (std::optional<InlineCost> ReplayCost =
1351 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1352 return *ReplayCost;
1353 // Adjust threshold based on call site hotness, only do this for callsite
1354 // prioritized inliner because otherwise cost-benefit check is done earlier.
1355 int SampleThreshold = SampleColdCallSiteThreshold;
1357 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1358 SampleThreshold = SampleHotCallSiteThreshold;
1359 else if (!ProfileSizeInline)
1360 return InlineCost::getNever("cold callsite");
1361 }
1362
1363 Function *Callee = Candidate.CallInstr->getCalledFunction();
1364 assert(Callee && "Expect a definition for inline candidate of direct call");
1365
1366 InlineParams Params = getInlineParams();
1367 // We will ignore the threshold from inline cost, so always get full cost.
1368 Params.ComputeFullInlineCost = true;
1370 // Checks if there is anything in the reachable portion of the callee at
1371 // this callsite that makes this inlining potentially illegal. Need to
1372 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1373 // when cost exceeds threshold without checking all IRs in the callee.
1374 // The actual cost does not matter because we only check isNever() to
1375 // see if it is legal to inline the callsite.
1376 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1377 GetTTI(*Callee), GetAC, GetTLI);
1378
1379 // Honor always inline and never inline from call analyzer
1380 if (Cost.isNever() || Cost.isAlways())
1381 return Cost;
1382
1383 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1384 // decisions based on hotness as well as accurate function byte sizes for
1385 // given context using function/inlinee sizes from previous build. It
1386 // stores the decision in profile, and also adjust/merge context profile
1387 // aiming at better context-sensitive post-inline profile quality, assuming
1388 // all inline decision estimates are going to be honored by compiler. Here
1389 // we replay that inline decision under `sample-profile-use-preinliner`.
1390 // Note that we don't need to handle negative decision from preinliner as
1391 // context profile for not inlined calls are merged by preinliner already.
1392 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1393 // Once two node are merged due to promotion, we're losing some context
1394 // so the original context-sensitive preinliner decision should be ignored
1395 // for SyntheticContext.
1396 SampleContext &Context = Candidate.CalleeSamples->getContext();
1397 if (!Context.hasState(SyntheticContext) &&
1399 return InlineCost::getAlways("preinliner");
1400 }
1401
1402 // For old FDO inliner, we inline the call site if it is below hot threshold,
1403 // even if the function is hot based on sample profile data. This is to
1404 // prevent huge functions from being inlined.
1407 }
1408
1409 // Otherwise only use the cost from call analyzer, but overwrite threshold with
1410 // Sample PGO threshold.
1411 return InlineCost::get(Cost.getCost(), SampleThreshold);
1412}
1413
/// Queue-driven variant of hot call site inlining for \p F.
///
/// All call sites of \p F that yield an inline candidate are pushed onto a
/// CandidateQueue. Candidates are then popped and inlined (indirect calls
/// are promoted first) until the queue is empty or \p F reaches SizeLimit
/// instructions. Call sites exposed by each successful inline are queued in
/// turn.
///
/// \param F Function whose call sites are processed.
/// \param InlinedGUIDs Set receiving GUIDs found by
/// findExternalInlineCandidate during ThinLTO pre-link.
/// \returns true if at least one call site was inlined.
// NOTE(review): the embedded listing numbers jump over 1419, 1442 and 1541,
// so part of the first assertion, the opening of the size-limit assertion,
// and the guard in front of promoteMergeNotInlinedContextSamples are missing
// from this excerpt -- confirm against the upstream file.
1414bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1415 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1416 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1417 // Profile symbol list is ignored when profile-sample-accurate is on.
1418 assert((!ProfAccForSymsInList ||
1420 !F.hasFnAttribute("profile-sample-accurate"))) &&
1421 "ProfAccForSymsInList should be false when profile-sample-accurate "
1422 "is enabled");
1423
1424 // Populating worklist with initial call sites from root inliner, along
1425 // with call site weights.
1426 CandidateQueue CQueue;
1427 InlineCandidate NewCandidate;
1428 for (auto &BB : F) {
1429 for (auto &I : BB) {
1430 auto *CB = dyn_cast<CallBase>(&I);
1431 if (!CB)
1432 continue;
1433 if (getInlineCandidate(&NewCandidate, CB))
1434 CQueue.push(NewCandidate);
1435 }
1436 }
1437
1438 // Cap the size growth from profile guided inlining. This is needed even
1439 // though cost of each inline candidate already accounts for callee size,
1440 // because with top-down inlining, we can grow inliner size significantly
1441 // with large number of smaller inlinees each pass the cost check.
1443 "Max inline size limit should not be smaller than min inline size "
1444 "limit.");
 // Growth-based limit, clamped to [ProfileInlineLimitMin,
 // ProfileInlineLimitMax]; effectively unlimited when an external advisor
 // is in use.
1445 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1446 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1447 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1448 if (ExternalInlineAdvisor)
1449 SizeLimit = std::numeric_limits<unsigned>::max();
1450
1451 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1452
1453 // Perform iterative BFS call site prioritized inlining
1454 bool Changed = false;
1455 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1456 InlineCandidate Candidate = CQueue.top();
1457 CQueue.pop();
1458 CallBase *I = Candidate.CallInstr;
1459 Function *CalledFunction = I->getCalledFunction();
1460
 // Do not inline recursive calls.
1461 if (CalledFunction == &F)
1462 continue;
1463 if (I->isIndirectCall()) {
1464 uint64_t Sum = 0;
1465 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
 // Keep the unscaled total; Sum itself is prorated by the call site's
 // distribution factor.
1466 uint64_t SumOrigin = Sum;
1467 Sum *= Candidate.CallsiteDistribution;
 // Number of targets promoted and inlined so far at this call site.
1468 unsigned ICPCount = 0;
1469 for (const auto *FS : CalleeSamples) {
1470 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1471 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1472 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1473 PSI->getOrCompHotCountThreshold());
1474 continue;
1475 }
1476 uint64_t EntryCountDistributed =
1477 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1478 // In addition to regular inline cost check, we also need to make sure
1479 // ICP isn't introducing excessive speculative checks even if individual
1480 // target looks beneficial to promote and inline. That means we should
1481 // only do ICP when there's a small number dominant targets.
1482 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1483 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1484 break;
1485 // TODO: Fix CallAnalyzer to handle all indirect calls.
1486 // For indirect call, we don't run CallAnalyzer to get InlineCost
1487 // before actual inlining. This is because we could see two different
1488 // types from the same definition, which makes CallAnalyzer choke as
1489 // it's expecting matching parameter type on both caller and callee
1490 // side. See example from PR18962 for the triggering cases (the bug was
1491 // fixed, but we generate different types).
1492 if (!PSI->isHotCount(EntryCountDistributed))
1493 break;
1494 SmallVector<CallBase *, 8> InlinedCallSites;
1495 // Attach function profile for promoted indirect callee, and update
1496 // call site count for the promoted inline candidate too.
1497 Candidate = {I, FS, EntryCountDistributed,
1498 Candidate.CallsiteDistribution};
1499 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1500 &InlinedCallSites)) {
 // Queue the call sites that inlining exposed.
1501 for (auto *CB : InlinedCallSites) {
1502 if (getInlineCandidate(&NewCandidate, CB))
1503 CQueue.emplace(NewCandidate);
1504 }
1505 ICPCount++;
1506 Changed = true;
1507 } else if (!ContextTracker) {
1508 LocalNotInlinedCallSites.insert({I, FS});
1509 }
1510 }
1511 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1512 !CalledFunction->isDeclaration()) {
 // Direct call to a function defined in this module that has a
 // subprogram attached.
1513 SmallVector<CallBase *, 8> InlinedCallSites;
1514 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1515 for (auto *CB : InlinedCallSites) {
1516 if (getInlineCandidate(&NewCandidate, CB))
1517 CQueue.emplace(NewCandidate);
1518 }
1519 Changed = true;
1520 } else if (!ContextTracker) {
1521 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1522 }
1523 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1524 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1525 InlinedGUIDs,
1526 PSI->getOrCompHotCountThreshold());
1527 }
1528 }
1529
 // Candidates left in the queue mean the loop stopped on the size limit;
 // count which limit was in effect.
1530 if (!CQueue.empty()) {
1531 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1532 ++NumCSInlinedHitMaxLimit;
1533 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1534 ++NumCSInlinedHitMinLimit;
1535 else
1536 ++NumCSInlinedHitGrowthLimit;
1537 }
1538
1539 // For CS profile, profile for not inlined context will be merged when
1540 // base profile is being retrieved.
1542 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1543 return Changed;
1544}
1545
/// Account for call sites that had an inlined-callee profile but were not
/// inlined into \p F.
///
/// For each such call site with a defined callee, a "NotInline" remark is
/// emitted. Then, depending on ProfileMergeInlinee, the callee profile is
/// either merged into the callee's outline profile or its entry-count
/// estimate is added to notInlinedCallInfo.
// NOTE(review): the embedded listing numbers jump over 1547, 1591 and 1592,
// so the parameter declaring NonInlinedCallSites and the index expression
// used with OutlineFunctionSamples are missing from this excerpt -- confirm
// against the upstream file.
1546void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1548 const Function &F) {
1549 // Accumulate not inlined callsite information into notInlinedSamples
1550 for (const auto &Pair : NonInlinedCallSites) {
1551 CallBase *I = Pair.first;
1552 Function *Callee = I->getCalledFunction();
 // Only direct calls to functions defined in this module are handled.
1553 if (!Callee || Callee->isDeclaration())
1554 continue;
1555
1556 ORE->emit(
1557 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1558 I->getDebugLoc(), I->getParent())
1559 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1560 << "' into '" << ore::NV("Caller", &F) << "'");
1561
1562 ++NumCSNotInlined;
1563 const FunctionSamples *FS = Pair.second;
 // A profile with no samples at all has nothing to merge.
1564 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1565 continue;
1566 }
1567
1568 // Do not merge a context that is already duplicated into the base profile.
1569 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1570 continue;
1571
1572 if (ProfileMergeInlinee) {
1573 // A function call can be replicated by optimizations like callsite
1574 // splitting or jump threading and the replicates end up sharing the
1575 // sample nested callee profile instead of slicing the original
1576 // inlinee's profile. We want to do merge exactly once by filtering out
1577 // callee profiles with a non-zero head sample count.
1578 if (FS->getHeadSamples() == 0) {
1579 // Use entry samples as head samples during the merge, as inlinees
1580 // don't have head samples.
 // The const_cast mutates the shared profile; the now non-zero head
 // sample count is what makes later replicas skip this branch.
1581 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1582 FS->getHeadSamplesEstimate());
1583
1584 // Note that we have to do the merge right after processing function.
1585 // This allows OutlineFS's profile to be used for annotation during
1586 // top-down processing of functions' annotation.
1587 FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);
1588 // If outlined function does not exist in the profile, add it to a
1589 // separate map so that it does not rehash the original profile.
1590 if (!OutlineFS)
1591 OutlineFS = &OutlineFunctionSamples[
1593 OutlineFS->merge(*FS, 1);
1594 // Set outlined profile to be synthetic to not bias the inliner.
1595 OutlineFS->setContextSynthetic();
1596 }
1597 } else {
 // Without merging, only accumulate the entry-count estimate per callee.
1598 auto pair =
1599 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1600 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1601 }
1602 }
1603}
1604
1605/// Returns the sorted CallTargetMap \p M by count in descending order.
///
/// Each entry produced by SampleRecord::sortCallTargets is converted to an
/// InstrProfValueData whose value is the target's hash code and whose count
/// is the target's sample count.
// NOTE(review): the embedded listing numbers jump from 1605 to 1609, so the
// signature and the declaration of R are missing from this excerpt --
// confirm the function name, parameter type and return type against the
// upstream file.
1609 for (const auto &I : SampleRecord::sortCallTargets(M)) {
1610 R.emplace_back(
1611 InstrProfValueData{I.first.getHashCode(), I.second});
1612 }
1613 return R;
1614}
1615
1616// Generate MD_prof metadata for every branch instruction using the
1617// edge weights computed during propagation.
1618void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1619 // Generate MD_prof metadata for every branch instruction using the
1620 // edge weights computed during propagation.
1621 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1622 LLVMContext &Ctx = F.getContext();
1623 MDBuilder MDB(Ctx);
1624 for (auto &BI : F) {
1625 BasicBlock *BB = &BI;
1626
1627 if (BlockWeights[BB]) {
1628 for (auto &I : *BB) {
1629 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1630 continue;
1631 if (!cast<CallBase>(I).getCalledFunction()) {
1632 const DebugLoc &DLoc = I.getDebugLoc();
1633 if (!DLoc)
1634 continue;
1635 const DILocation *DIL = DLoc;
1636 const FunctionSamples *FS = findFunctionSamples(I);
1637 if (!FS)
1638 continue;
1641 FS->findCallTargetMapAt(CallSite);
1642 if (!T || T.get().empty())
1643 continue;
1645 // Prorate the callsite counts based on the pre-ICP distribution
1646 // factor to reflect what is already done to the callsite before
1647 // ICP, such as callsite cloning.
1648 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1649 if (Probe->Factor < 1)
1650 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1651 }
1652 }
1653 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1655 uint64_t Sum = 0;
1656 for (const auto &C : T.get())
1657 Sum += C.second;
1658 // With CSSPGO all indirect call targets are counted towards the
1659 // original indirect call site in the profile, including both
1660 // inlined and non-inlined targets.
1662 if (const FunctionSamplesMap *M =
1663 FS->findFunctionSamplesMapAt(CallSite)) {
1664 for (const auto &NameFS : *M)
1665 Sum += NameFS.second.getHeadSamplesEstimate();
1666 }
1667 }
1668 if (Sum)
1669 updateIDTMetaData(I, SortedCallTargets, Sum);
1670 else if (OverwriteExistingWeights)
1671 I.setMetadata(LLVMContext::MD_prof, nullptr);
1672 } else if (!isa<IntrinsicInst>(&I)) {
1673 setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])},
1674 /*IsExpected=*/false);
1675 }
1676 }
1678 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1679 // clear it for cold code.
1680 for (auto &I : *BB) {
1681 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1682 if (cast<CallBase>(I).isIndirectCall()) {
1683 I.setMetadata(LLVMContext::MD_prof, nullptr);
1684 } else {
1685 setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false);
1686 }
1687 }
1688 }
1689 }
1690
1691 Instruction *TI = BB->getTerminator();
1692 if (TI->getNumSuccessors() == 1)
1693 continue;
1694 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1695 !isa<IndirectBrInst>(TI))
1696 continue;
1697
1698 DebugLoc BranchLoc = TI->getDebugLoc();
1699 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1700 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1701 : Twine("<UNKNOWN LOCATION>"))
1702 << ".\n");
1704 uint32_t MaxWeight = 0;
1705 Instruction *MaxDestInst;
1706 // Since profi treats multiple edges (multiway branches) as a single edge,
1707 // we need to distribute the computed weight among the branches. We do
1708 // this by evenly splitting the edge weight among destinations.
1710 std::vector<uint64_t> EdgeIndex;
1712 EdgeIndex.resize(TI->getNumSuccessors());
1713 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1714 const BasicBlock *Succ = TI->getSuccessor(I);
1715 EdgeIndex[I] = EdgeMultiplicity[Succ];
1716 EdgeMultiplicity[Succ]++;
1717 }
1718 }
1719 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1720 BasicBlock *Succ = TI->getSuccessor(I);
1721 Edge E = std::make_pair(BB, Succ);
1722 uint64_t Weight = EdgeWeights[E];
1723 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1724 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1725 // if needed. Sample counts in profiles are 64-bit unsigned values,
1726 // but internally branch weights are expressed as 32-bit values.
1727 if (Weight > std::numeric_limits<uint32_t>::max()) {
1728 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)\n");
1729 Weight = std::numeric_limits<uint32_t>::max();
1730 }
1731 if (!SampleProfileUseProfi) {
1732 // Weight is added by one to avoid propagation errors introduced by
1733 // 0 weights.
1734 Weights.push_back(static_cast<uint32_t>(
1735 Weight == std::numeric_limits<uint32_t>::max() ? Weight
1736 : Weight + 1));
1737 } else {
1738 // Profi creates proper weights that do not require "+1" adjustments but
1739 // we evenly split the weight among branches with the same destination.
1740 uint64_t W = Weight / EdgeMultiplicity[Succ];
1741 // Rounding up, if needed, so that first branches are hotter.
1742 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1743 W++;
1744 Weights.push_back(static_cast<uint32_t>(W));
1745 }
1746 if (Weight != 0) {
1747 if (Weight > MaxWeight) {
1748 MaxWeight = Weight;
1749 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1750 }
1751 }
1752 }
1753
1754 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1755
1756 uint64_t TempWeight;
1757 // Only set weights if there is at least one non-zero weight.
1758 // In any other case, let the analyzer set weights.
1759 // Do not set weights if the weights are present unless under
1760 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1761 // twice. If the first annotation already set the weights, the second pass
1762 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1763 // weight should have their existing metadata (possibly annotated by LTO
1764 // prelink) cleared.
1765 if (MaxWeight > 0 &&
1766 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1767 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1768 setBranchWeights(*TI, Weights, /*IsExpected=*/false);
1769 ORE->emit([&]() {
1770 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1771 << "most popular destination for conditional branches at "
1772 << ore::NV("CondBranchesLoc", BranchLoc);
1773 });
1774 } else {
1776 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1777 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1778 } else {
1779 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1780 }
1781 }
1782 }
1783}
1784
1785/// Drive profile annotation for one function: check that the profile is
1786/// usable, inline hot call sites, propagate weights, and emit MD_prof metadata.
1787///
/// Steps visible in this body, in order:
///  - a usability check: either ProbeManager->profileIsValid(F, *Samples)
///    (updating NumMatchedProfile / NumMismatchedProfile) or a non-zero
///    getFunctionLoc(F);
///  - inlineHotFunctionsWithPriority() or inlineHotFunctions();
///  - computeAndPropagateWeights();
///  - generateMDProfMetadata(), only when one of the steps above reported a
///    change;
///  - emitCoverageRemarks().
///
/// NOTE(review): this listing skips embedded source lines 1794, 1808 and 1820,
/// so the conditions that select between the branches below are not visible
/// here. Restore them from the upstream file before building.
///
1788/// \param F The function to query.
1789///
1790/// \returns true if \p F was modified. Returns false, otherwise.
1791bool SampleProfileLoader::emitAnnotations(Function &F) {
1792 bool Changed = false;
1793
// NOTE(review): embedded line 1794 is missing; it presumably opens the `if`
// that the `} else {` at line 1811 belongs to.
1795 LLVM_DEBUG({
1796 if (!ProbeManager->getDesc(F))
1797 dbgs() << "Probe descriptor missing for Function " << F.getName()
1798 << "\n";
1799 });
1800
1801 if (ProbeManager->profileIsValid(F, *Samples)) {
1802 ++NumMatchedProfile;
1803 } else {
1804 ++NumMismatchedProfile;
1805 LLVM_DEBUG(
1806 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1807 << F.getName() << "\n");
// NOTE(review): embedded line 1808 is missing; whether the return below is
// unconditional cannot be determined from this listing.
1809 return false;
1810 }
1811 } else {
// No usable source location for the function: nothing can be annotated.
1812 if (getFunctionLoc(F) == 0)
1813 return false;
1814
1815 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1816 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1817 }
1818
// GUIDs collected by the inlining step and handed to weight propagation.
1819 DenseSet<GlobalValue::GUID> InlinedGUIDs;
// NOTE(review): embedded line 1820 (the `if` paired with the `else` at line
// 1822) is missing.
1821 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1822 else
1823 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1824
1825 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1826
// Metadata is only emitted when inlining or propagation changed something.
1827 if (Changed)
1828 generateMDProfMetadata(F);
1829
1830 emitCoverageRemarks(F);
1831 return Changed;
1832}
1833
/// Build the ProfiledCallGraph for module \p M.
///
/// The graph is constructed either from *ContextTracker or from
/// Reader->getProfiles(), and functions of \p M are then registered with
/// addProfiledFunction().
///
/// NOTE(review): embedded lines 1837 (the condition choosing the constructor),
/// 1846 (the filter guarding the `continue`) and 1849 (the argument of
/// addProfiledFunction) are missing from this listing.
///
/// \param M The module whose functions are added to the graph.
///
/// \returns the newly built graph, owned by the caller.
1834std::unique_ptr<ProfiledCallGraph>
1835SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1836 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
1838 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1839 else
1840 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1841
1842 // Add all functions into the profiled call graph even if they are not in
1843 // the profile. This makes sure functions missing from the profile still
1844 // get a chance to be processed.
1845 for (Function &F : M) {
1847 continue;
1848 ProfiledCG->addProfiledFunction(
1850 }
1851
1852 return ProfiledCG;
1853}
1854
/// Compute the order in which the functions of \p M are processed.
///
/// Three paths are visible:
///  - ProfileTopDownLoad is off: ProfileMergeInlinee is forced to false and
///    functions are appended in module iteration order.
///  - The profiled-call-graph condition holds: the SCCs of the graph returned
///    by buildProfiledCallGraph() are walked, each node name is mapped to a
///    Function through SymbolMap, functions for which skipProfileForFunction()
///    is false are collected, and the collected list is reversed.
///  - Otherwise buildTopDownFuncOrder(CG, ...) fills the list.
///
/// NOTE(review): embedded lines 1860 (the condition guarding the warning),
/// 1875 (the filter inside the module loop), 1880 (the first half of the
/// profiled-call-graph condition) and 1936 (the declaration of `SI`) are
/// missing from this listing.
///
/// \param M The module whose functions are ordered.
/// \param CG The static call graph, used only by the fallback path.
///
/// \returns the functions in processing order.
1855std::vector<Function *>
1856SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1857 std::vector<Function *> FunctionOrderList;
1858 FunctionOrderList.reserve(M.size());
1859
1861 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1862 "together with -sample-profile-top-down-load.\n";
1863
1864 if (!ProfileTopDownLoad) {
1865 if (ProfileMergeInlinee) {
1866 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1867 // because the profile for a function may be used for the profile
1868 // annotation of its outline copy before the profile merging of its
1869 // non-inlined inline instances, and that is not how
1870 // ProfileMergeInlinee is supposed to work.
1871 ProfileMergeInlinee = false;
1872 }
1873
1874 for (Function &F : M)
1876 FunctionOrderList.push_back(&F);
1877 return FunctionOrderList;
1878 }
1879
1881 !UseProfiledCallGraph.getNumOccurrences())) {
1882 // Use profiled call edges to augment the top-down order. There are cases
1883 // that the top-down order computed based on the static call graph doesn't
1884 // reflect real execution order. For example
1885 //
1886 // 1. Incomplete static call graph due to unknown indirect call targets.
1887 // Adjusting the order by considering indirect call edges from the
1888 // profile can enable the inlining of indirect call targets by allowing
1889 // the caller processed before them.
1890 // 2. Mutual call edges in an SCC. The static processing order computed for
1891 // an SCC may not reflect the call contexts in the context-sensitive
1892 // profile, thus may cause potential inlining to be overlooked. The
1893 // function order in one SCC is being adjusted to a top-down order based
1894 // on the profile to favor more inlining. This is only a problem with CS
1895 // profile.
1896 // 3. Transitive indirect call edges due to inlining. When a callee function
1897 // (say B) is inlined into a caller function (say A) in LTO prelink,
1898 // every call edge originated from the callee B will be transferred to
1899 // the caller A. If any transferred edge (say A->C) is indirect, the
1900 // original profiled indirect edge B->C, even if considered, would not
1901 // enforce a top-down order from the caller A to the potential indirect
1902 // call target C in LTO postlink since the inlined callee B is gone from
1903 // the static call graph.
1904 // 4. #3 can happen even for direct call targets, due to functions defined
1905 // in header files. A header function (say A), when included into source
1906 // files, is defined multiple times but only one definition survives due
1907 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1908 // definitions can be useless based on a local file scope. More
1909 // importantly, the inlinee (say B), once fully inlined to a
1910 // to-be-dropped A, will have no profile to consume when its outlined
1911 // version is compiled. This can lead to a profile-less prelink
1912 // compilation for the outlined version of B which may be called from
1913 // external modules. While this isn't easy to fix, we rely on the
1914 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1915 // the A can be inlined in its local scope in prelink, it may not exist
1916 // in the merged IR in postlink, and we'll need the profiled call edges
1917 // to enforce a top-down order for the rest of the functions.
1918 //
1919 // Considering those cases, a profiled call graph completely independent of
1920 // the static call graph is constructed based on profile data, where
1921 // function objects are not even needed to handle case #3 and case #4.
1922 //
1923 // Note that static callgraph edges are completely ignored since they
1924 // can be conflicting with profiled edges for cyclic SCCs and may result in
1925 // an SCC order incompatible with profile-defined one. Using strictly
1926 // profile order ensures a maximum inlining experience. On the other hand,
1927 // static call edges are not so important when they don't correspond to a
1928 // context in the profile.
1929
1930 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1931 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1932 while (!CGI.isAtEnd()) {
1933 auto Range = *CGI;
1934 if (SortProfiledSCC) {
1935 // Sort nodes in one SCC based on callsite hotness.
1937 Range = *SI;
1938 }
// Graph nodes carry names only; nodes with no Function in SymbolMap (lookup
// yields null) are skipped.
1939 for (auto *Node : Range) {
1940 Function *F = SymbolMap.lookup(Node->Name);
1941 if (F && !skipProfileForFunction(*F))
1942 FunctionOrderList.push_back(F);
1943 }
1944 ++CGI;
1945 }
// The SCC walk order is flipped to obtain the final processing order
// (presumably because scc_iterator yields callees first -- confirm).
1946 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1947 } else
1948 buildTopDownFuncOrder(CG, FunctionOrderList);
1949
1950 LLVM_DEBUG({
1951 dbgs() << "Function processing order:\n";
1952 for (auto F : FunctionOrderList) {
1953 dbgs() << F->getName() << "\n";
1954 }
1955 });
1956
1957 return FunctionOrderList;
1958}
1959
/// Create the profile reader, read the profile for \p M, and configure the
/// loader from the properties of the profile that was read.
///
/// Work visible in this body:
///  - create and run the SampleProfileReader, emitting a
///    DiagnosticInfoSampleProfile and returning false on either failure;
///  - cache the profile symbol list (PSL) and, when requested, flatten the
///    profile;
///  - populate GUIDsInProfile / NamesInProfile from the reader's name table;
///  - set up the replay inline advisor when a replay file was given;
///  - for CS, pre-inlined or probe-based profiles, turn on a set of options
///    unless the user set them explicitly (getNumOccurrences() != 0);
///  - create ContextTracker, ProbeManager and MatchingManager as applicable.
///
/// NOTE(review): this listing skips embedded lines 1961 (the remaining
/// parameters and the opening brace), 1993, 1998, 2012-2015, 2023, 2027, 2032
/// and 2082-2083, so several conditions and assignments are not visible here.
///
/// \param M The module being compiled.
///
/// \returns false if the reader could not be created, the profile could not
/// be read, or a probe-based profile is used with a module for which
/// moduleIsProbed() is false; true otherwise.
1960bool SampleProfileLoader::doInitialization(Module &M,
1962 auto &Ctx = M.getContext();
1963
1964 auto ReaderOrErr = SampleProfileReader::create(
1965 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
1966 if (std::error_code EC = ReaderOrErr.getError()) {
1967 std::string Msg = "Could not open profile: " + EC.message();
1968 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1969 return false;
1970 }
1971 Reader = std::move(ReaderOrErr.get());
1972 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
1973 // Set module before reading the profile so reader may be able to only
1974 // read the function profiles which are used by the current module.
1975 Reader->setModule(&M);
1976 if (std::error_code EC = Reader->read()) {
1977 std::string Msg = "profile reading failed: " + EC.message();
1978 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1979 return false;
1980 }
1981
1982 PSL = Reader->getProfileSymbolList();
1983
// An explicit command-line setting overrides the value passed at construction.
1984 if (DisableSampleLoaderInlining.getNumOccurrences())
1985 DisableSampleProfileInlining = DisableSampleLoaderInlining;
1986
1987 if (UseFlattenedProfile)
1988 ProfileConverter::flattenProfile(Reader->getProfiles(),
1989 Reader->profileIsCS());
1990
1991 // While profile-sample-accurate is on, ignore symbol list.
// NOTE(review): embedded line 1993 (the right-hand side of this assignment)
// is missing.
1992 ProfAccForSymsInList =
1994 if (ProfAccForSymsInList) {
1995 NamesInProfile.clear();
1996 GUIDsInProfile.clear();
1997 if (auto NameTable = Reader->getNameTable()) {
// NOTE(review): embedded line 1998 (the `if` choosing between hash codes and
// string names) is missing.
1999 for (auto Name : *NameTable)
2000 GUIDsInProfile.insert(Name.getHashCode());
2001 } else {
2002 for (auto Name : *NameTable)
2003 NamesInProfile.insert(Name.stringRef());
2004 }
2005 }
2006 CoverageTracker.setProfAccForSymsInList(true);
2007 }
2008
2009 if (FAM && !ProfileInlineReplayFile.empty()) {
2010 ExternalInlineAdvisor = getReplayInlineAdvisor(
2011 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
2016 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2017 }
2018
2019 // Apply tweaks if context-sensitive or probe-based profile is available.
// Each tweak below is applied only when the user did not set the option
// explicitly on the command line.
2020 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2021 Reader->profileIsProbeBased()) {
2022 if (!UseIterativeBFIInference.getNumOccurrences())
2024 if (!SampleProfileUseProfi.getNumOccurrences())
2025 SampleProfileUseProfi = true;
2026 if (!EnableExtTspBlockPlacement.getNumOccurrences())
2028 // Enable priority-based inliner and size inline by default for CSSPGO.
2029 if (!ProfileSizeInline.getNumOccurrences())
2030 ProfileSizeInline = true;
2031 if (!CallsitePrioritizedInline.getNumOccurrences())
2033 // For CSSPGO, we also allow recursive inline to best use context profile.
2034 if (!AllowRecursiveInline.getNumOccurrences())
2035 AllowRecursiveInline = true;
2036
2037 if (Reader->profileIsPreInlined()) {
2038 if (!UsePreInlinerDecision.getNumOccurrences())
2039 UsePreInlinerDecision = true;
2040 }
2041
2042 // Enable stale profile matching by default for probe-based profile.
2043 // Currently the matching relies on if the checksum mismatch is detected,
2044 // which is currently only available for pseudo-probe mode. Removing the
2045 // checksum check could cause regressions for some cases, so further tuning
2046 // might be needed if we want to enable it for all cases.
2047 if (Reader->profileIsProbeBased() &&
2048 !SalvageStaleProfile.getNumOccurrences()) {
2049 SalvageStaleProfile = true;
2050 }
2051
2052 if (!Reader->profileIsCS()) {
2053 // Non-CS profile should be fine without a function size budget for the
2054 // inliner since the contexts in the profile are either all from inlining
2055 // in the previous build or pre-computed by the preinliner with a size
2056 // cap, thus they are bounded.
2057 if (!ProfileInlineLimitMin.getNumOccurrences())
2058 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2059 if (!ProfileInlineLimitMax.getNumOccurrences())
2060 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2061 }
2062 }
2063
2064 if (Reader->profileIsCS()) {
2065 // Tracker for profiles under different context
2066 ContextTracker = std::make_unique<SampleContextTracker>(
2067 Reader->getProfiles(), &GUIDToFuncNameMap);
2068 }
2069
2070 // Load pseudo probe descriptors for probe-based function samples.
2071 if (Reader->profileIsProbeBased()) {
2072 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2073 if (!ProbeManager->moduleIsProbed(M)) {
2074 const char *Msg =
2075 "Pseudo-probe-based profile requires SampleProfileProbePass";
2076 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2077 DS_Warning));
2078 return false;
2079 }
2080 }
2081
// NOTE(review): embedded lines 2082-2083 (the condition guarding creation of
// the matcher) are missing.
2084 MatchingManager = std::make_unique<SampleProfileMatcher>(
2085 M, *Reader, CG, ProbeManager.get(), LTOPhase, SymbolMap, PSL,
2086 FuncNameToProfNameMap);
2087 }
2088
2089 return true;
2090}
2091
2092// Note that this is a module-level check. Even if one module is errored out,
2093// the entire build will be errored out. However, the user could make big
2094// changes to functions in single module but those changes might not be
2095// performance significant to the whole binary. Therefore, to avoid those false
2096// positives, we select a reasonable big set of hot functions that are supposed
2097// to be globally performance significant, only compute and check the mismatch
2098// within those functions. The function selection is based on two criteria:
2099// 1) The function is hot enough, which is tuned by a hotness-based
2100// flag(HotFuncCutoffForStalenessError). 2) The num of function is large enough
2101// which is tuned by the MinfuncsForStalenessError flag.
//
// Returns true, after emitting a DiagnosticInfoSampleProfile, when at least
// MinfuncsForStalenessError functions were selected and
//   NumMismatchedFunc * 100 >= TotalHotFunc * PrecentMismatchForStalenessError.
// Returns false otherwise. Profiles with no probe descriptor
// (ProbeManager->getDesc() returns null) are not counted at all.
//
// NOTE(review): embedded lines 2104 (the start of the statement ending in the
// "Only support for probe-based profile" message) and 2115 (the start of the
// hotness condition) are missing from this listing.
2102bool SampleProfileLoader::rejectHighStalenessProfile(
2103 Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) {
2105 "Only support for probe-based profile");
2106 uint64_t TotalHotFunc = 0;
2107 uint64_t NumMismatchedFunc = 0;
2108 for (const auto &I : Profiles) {
2109 const auto &FS = I.second;
2110 const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());
2111 if (!FuncDesc)
2112 continue;
2113
2114 // Use a hotness-based threshold to control the function selection.
2116 FS.getTotalSamples()))
2117 continue;
2118
2119 TotalHotFunc++;
2120 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS))
2121 NumMismatchedFunc++;
2122 }
2123 // Make sure that the num of selected function is not too small to distinguish
2124 // from the user's benign changes.
2125 if (TotalHotFunc < MinfuncsForStalenessError)
2126 return false;
2127
2128 // Finally check the mismatch percentage against the threshold.
2129 if (NumMismatchedFunc * 100 >=
2130 TotalHotFunc * PrecentMismatchForStalenessError) {
2131 auto &Ctx = M.getContext();
2132 const char *Msg =
2133 "The input profile significantly mismatches current source code. "
2134 "Please recollect profile to avoid performance regression.";
2135 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg));
2136 return true;
2137 }
2138 return false;
2139}
2140
/// Strip pseudo-probe artifacts from every function in \p M.
///
/// PseudoProbeInst instructions are collected and erased. For every other
/// CallBase that has a debug location whose discriminator satisfies
/// DILocation::isPseudoProbeDiscriminator(), the debug location is replaced by
/// a clone carrying the recovered DWARF discriminator, or 0 when none is
/// available.
///
/// NOTE(review): embedded line 2154 (the call that produces
/// DwarfDiscriminator) is missing from this listing.
///
/// \param M The module to clean up.
2141void SampleProfileLoader::removePseudoProbeInstsDiscriminator(Module &M) {
2142 for (auto &F : M) {
// Probe instructions are only recorded during the walk and erased afterwards.
2143 std::vector<Instruction *> InstsToDel;
2144 for (auto &BB : F) {
2145 for (auto &I : BB) {
2146 if (isa<PseudoProbeInst>(&I))
2147 InstsToDel.push_back(&I);
2148 else if (isa<CallBase>(&I))
2149 if (const DILocation *DIL = I.getDebugLoc().get()) {
2150 // Restore dwarf discriminator for call.
2151 unsigned Discriminator = DIL->getDiscriminator();
2152 if (DILocation::isPseudoProbeDiscriminator(Discriminator)) {
2153 std::optional<uint32_t> DwarfDiscriminator =
2155 Discriminator);
2156 I.setDebugLoc(DIL->cloneWithDiscriminator(
2157 DwarfDiscriminator ? *DwarfDiscriminator : 0));
2158 }
2159 }
2160 }
2161 }
2162 for (auto *I : InstsToDel)
2163 I->eraseFromParent();
2164 }
2165}
2166
/// Apply the loaded sample profile to every function of \p M.
///
/// Work visible in this body, in order:
///  - install the reader's profile summary on the module when it has none and
///    refresh \p _PSI;
///  - return false when rejectHighStalenessProfile() rejects the profile;
///  - accumulate TotalCollectedSamples over all profiles;
///  - populate SymbolMap from the module's value symbol table, including
///    alternative and remapped names;
///  - run stale-profile matching through MatchingManager;
///  - call runOnFunction() on each function in buildFunctionOrder() order;
///  - update callee profiles for the entries of notInlinedCallInfo;
///  - remove pseudo-probe instructions and the probe descriptor metadata.
///
/// NOTE(review): this listing skips embedded lines 2174, 2178, 2194,
/// 2215-2216, 2234 and 2239-2240, so several conditions and the declaration
/// of `NewName` are not visible here.
///
/// \param M The module to annotate.
/// \param AM Module analysis manager, forwarded to runOnFunction().
/// \param _PSI Profile summary info, stored in the PSI member.
///
/// \returns true if runOnFunction() returned true for at least one function.
2167bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2168 ProfileSummaryInfo *_PSI) {
2169 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2170
2171 PSI = _PSI;
2172 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2173 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
2175 PSI->refresh();
2176 }
2177
// NOTE(review): embedded line 2178 (the first half of this condition) is
// missing.
2179 rejectHighStalenessProfile(M, PSI, Reader->getProfiles()))
2180 return false;
2181
2182 // Compute the total number of samples collected in this profile.
2183 for (const auto &I : Reader->getProfiles())
2184 TotalCollectedSamples += I.second.getTotalSamples();
2185
2186 auto Remapper = Reader->getRemapper();
2187 // Populate the symbol map.
2188 for (const auto &N_F : M.getValueSymbolTable()) {
2189 StringRef OrigName = N_F.getKey();
2190 Function *F = dyn_cast<Function>(N_F.getValue());
2191 if (F == nullptr || OrigName.empty())
2192 continue;
2193 SymbolMap[FunctionId(OrigName)] = F;
// NOTE(review): embedded line 2194 (the declaration of NewName) is missing.
2195 if (OrigName != NewName && !NewName.empty()) {
2196 auto r = SymbolMap.emplace(FunctionId(NewName), F);
2197 // Failing to insert means there is already an entry in SymbolMap,
2198 // thus there are multiple functions that are mapped to the same
2199 // stripped name. In this case of name conflicting, set the value
2200 // to nullptr to avoid confusion.
2201 if (!r.second)
2202 r.first->second = nullptr;
2203 OrigName = NewName;
2204 }
2205 // Insert the remapped names into SymbolMap.
2206 if (Remapper) {
2207 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2208 if (*MapName != OrigName && !MapName->empty())
2209 SymbolMap.emplace(FunctionId(*MapName), F);
2210 }
2211 }
2212 }
2213
2214 // Stale profile matching.
// NOTE(review): embedded lines 2215-2216 (the condition opening this block)
// are missing.
2217 MatchingManager->runOnModule();
2218 MatchingManager->clearMatchingData();
2219 }
2220 assert(SymbolMap.count(FunctionId()) == 0 &&
2221 "No empty StringRef should be added in SymbolMap");
2222 assert((SalvageUnusedProfile || FuncNameToProfNameMap.empty()) &&
2223 "FuncNameToProfNameMap is not empty when --salvage-unused-profile is "
2224 "not enabled");
2225
2226 bool retval = false;
// Per-function state is reset before each function is processed.
2227 for (auto *F : buildFunctionOrder(M, CG)) {
2228 assert(!F->isDeclaration());
2229 clearFunctionData();
2230 retval |= runOnFunction(*F, AM);
2231 }
2232
2233 // Account for cold calls not inlined....
2235 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2236 notInlinedCallInfo)
2237 updateProfileCallee(pair.first, pair.second.entryCount);
2238
// NOTE(review): embedded lines 2239-2240 (the condition opening this block)
// are missing.
2241 removePseudoProbeInstsDiscriminator(M);
2242 if (auto *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName))
2243 M.eraseNamedMetadata(FuncInfo);
2244 }
2245
2246 return retval;
2247}
2248
/// Prepare per-function state for \p F and annotate it when samples exist.
///
/// Work visible in this body, in order:
///  - choose an initial entry count: -1 (unknown) by default, 0 (cold) when
///    profile-sample-accurate applies or when \p F is in the profile symbol
///    list, and back to -1 when \p F's canonical name appears in the profile;
///  - install that entry count only if \p F has none yet;
///  - select the OptimizationRemarkEmitter (a locally owned one when \p AM is
///    null);
///  - locate the samples for \p F: from ContextTracker, from the reader, or
///    from OutlineFunctionSamples (also trying the remapped name);
///  - call emitAnnotations() when non-empty samples were found.
///
/// NOTE(review): this listing skips embedded lines 2288-2289, 2302, 2304,
/// 2310 and 2317, so the declarations of `CanonName`, the analysis-manager
/// lookup and the condition selecting ContextTracker are not visible here.
///
/// \param F The function to process.
/// \param AM Module analysis manager; may be null.
///
/// \returns the result of emitAnnotations(), or false when \p F has no
/// samples or only empty samples.
2249bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2250 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2251 DILocation2SampleMap.clear();
2252 // By default the entry count is initialized to -1, which will be treated
2253 // conservatively by getEntryCount as the same as unknown (None). This is
2254 // to avoid newly added code to be treated as cold. If we have samples
2255 // this will be overwritten in emitAnnotations.
2256 uint64_t initialEntryCount = -1;
2257
2258 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2259 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2260 // Initialize all the function entry counts to 0. It means all the
2261 // functions without profile will be regarded as cold.
2262 initialEntryCount = 0;
2263 // profile-sample-accurate is a user assertion which has a higher precedence
2264 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2265 ProfAccForSymsInList = false;
2266 }
2267 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2268
2269 // PSL -- profile symbol list include all the symbols in sampled binary.
2270 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2271 // old functions without samples being cold, without having to worry
2272 // about new and hot functions being mistakenly treated as cold.
2273 if (ProfAccForSymsInList) {
2274 // Initialize the entry count to 0 for functions in the list.
2275 if (PSL->contains(F.getName()))
2276 initialEntryCount = 0;
2277
2278 // Function in the symbol list but without sample will be regarded as
2279 // cold. To minimize the potential negative performance impact it could
2280 // have, we want to be a little conservative here saying if a function
2281 // shows up in the profile, no matter as outline function, inline instance
2282 // or call targets, treat the function as not being cold. This will handle
2283 // the cases such as most callsites of a function are inlined in sampled
2284 // binary but not inlined in current build (because of source code drift,
2285 // imprecise debug information, or the callsites are all cold individually
2286 // but not cold accumulatively...), so the outline function showing up as
2287 // cold in sampled binary will actually not be cold after current build.
// NOTE(review): embedded lines 2288-2289 (the declaration of CanonName and
// the start of this condition) are missing.
2290 GUIDsInProfile.count(Function::getGUID(CanonName))) ||
2291 (!FunctionSamples::UseMD5 && NamesInProfile.count(CanonName)))
2292 initialEntryCount = -1;
2293 }
2294
2295 // Initialize entry count when the function has no existing entry
2296 // count value.
2297 if (!F.getEntryCount())
2298 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
2299 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2300 if (AM) {
// NOTE(review): embedded lines 2302 and 2304 (the proxy lookup and the
// assignment of ORE) are missing.
2301 auto &FAM =
2303 .getManager();
2305 } else {
// Without an analysis manager, a remark emitter owned by this call is used.
2306 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2307 ORE = OwnedORE.get();
2308 }
2309
// NOTE(review): embedded line 2310 (the `if` paired with the `else` at line
// 2312) is missing.
2311 Samples = ContextTracker->getBaseSamplesFor(F);
2312 else {
2313 Samples = Reader->getSamplesFor(F);
2314 // Try search in previously inlined functions that were split or duplicated
2315 // into base.
2316 if (!Samples) {
// NOTE(review): embedded line 2317 (the declaration of CanonName in this
// scope) is missing.
2318 auto It = OutlineFunctionSamples.find(FunctionId(CanonName));
2319 if (It != OutlineFunctionSamples.end()) {
2320 Samples = &It->second;
2321 } else if (auto Remapper = Reader->getRemapper()) {
2322 if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {
2323 It = OutlineFunctionSamples.find(FunctionId(*RemppedName));
2324 if (It != OutlineFunctionSamples.end())
2325 Samples = &It->second;
2326 }
2327 }
2328 }
2329 }
2330
2331 if (Samples && !Samples->empty())
2332 return emitAnnotations(F);
2333 return false;
2334}
// NOTE(review): the declarator line of this definition (embedded line 2335)
// is missing from this listing. What remains is a parameter list, a
// member-initializer list and an empty body: a constructor that only stores
// its arguments in the members ProfileFileName, ProfileRemappingFileName,
// LTOPhase, FS, DisableSampleProfileInlining and UseFlattenedProfile.
// Presumably this is the constructor of the pass class whose entry point
// follows -- confirm against the upstream file.
2336 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2337 IntrusiveRefCntPtr<vfs::FileSystem> FS, bool DisableSampleProfileInlining,
2338 bool UseFlattenedProfile)
2339 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2340 LTOPhase(LTOPhase), FS(std::move(FS)),
2341 DisableSampleProfileInlining(DisableSampleProfileInlining),
2342 UseFlattenedProfile(UseFlattenedProfile) {}
2343
// NOTE(review): the header of this definition (embedded lines 2344-2347) is
// missing from this listing, as are the lambda bodies (2350, 2353, 2356), the
// statements under `if (!FS)` (2360-2361) and line 2372 (presumably where
// `PSI` is obtained). Restore them from the upstream file before building.
//
// Visible behavior: a SampleProfileLoader is constructed, using
// SampleProfileFile / SampleProfileRemappingFile as fallbacks when the
// corresponding member file names are empty. PreservedAnalyses::all() is
// returned when doInitialization() or runOnModule() returns false;
// PreservedAnalyses::none() is returned otherwise.
2348
2349 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2351 };
2352 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2354 };
2355 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2357 };
2358
2359 if (!FS)
2362
2363 SampleProfileLoader SampleLoader(
2364 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2365 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2366 : ProfileRemappingFileName,
2367 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG,
2368 DisableSampleProfileInlining, UseFlattenedProfile);
// Initialization failed: nothing was changed, so every analysis is preserved.
2369 if (!SampleLoader.doInitialization(M, &FAM))
2370 return PreservedAnalyses::all();
2371
2373 if (!SampleLoader.runOnModule(M, &AM, PSI))
2374 return PreservedAnalyses::all();
2375
2376 return PreservedAnalyses::none();
2377}
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
LVReader * CurrentReader
Definition: LVReader.cpp:152
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
This file defines the PriorityQueue class.
This file contains the declarations for profiling metadata utility functions.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for SampleProfileMatcher.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< unsigned > MinfuncsForStalenessError("min-functions-for-staleness-error", cl::Hidden, cl::init(50), cl::desc("Skip the check if the number of hot functions is smaller than " "the specified number."))
cl::opt< bool > SalvageUnusedProfile("salvage-unused-profile", cl::Hidden, cl::init(false), cl::desc("Salvage unused profile by matching with new " "functions on call graph."))
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > PrecentMismatchForStalenessError("precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80), cl::desc("Reject the profile if the mismatch percent is higher than the " "given number."))
static cl::opt< bool > RemoveProbeAfterProfileAnnotation("sample-profile-remove-probe", cl::Hidden, cl::init(false), cl::desc("Remove pseudo-probe after sample profile annotation."))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< unsigned > HotFuncCutoffForStalenessError("hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000), cl::desc("A function is considered hot for staleness error check if its " "total sample count is above the specified percentile"))
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Definition: InstrTypes.h:1120
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
static bool isPseudoProbeDiscriminator(unsigned Discriminator)
const DILocation * cloneWithDiscriminator(unsigned Discriminator) const
Returns a new DILocation with updated Discriminator.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
Diagnostic information for the sample profiler.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:292
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1874
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:255
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1788
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A lazily constructed view of the call graph of a module.
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:193
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
void refresh()
If no summary is present, attempt to refresh.
bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C) const
Returns true if count C is considered hot with regard to a given hot percentile cutoff value.
Sample profile inference pass.
void computeDominanceAndLoopInfo(FunctionT &F)
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr, bool DisableSampleProfileInlining=false, bool UseFlattenedProfile=false)
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
const ParentTy * getParent() const
Definition: ilist_node.h:32
This class represents a function that is read from a sample profile.
Definition: FunctionId.h:36
Representation of the samples collected for a function.
Definition: SampleProf.h:745
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const HashKeyMap< std::unordered_map, FunctionId, Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:1041
FunctionId getFunction() const
Return the function name.
Definition: SampleProf.h:1074
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1090
SampleContext & getContext() const
Definition: SampleProf.h:1192
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:998
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:949
uint64_t getGUID() const
Return the GUID of the context's name.
Definition: SampleProf.h:1211
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:973
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1197
This class is a wrapper to associative container MapT<KeyT, ValueT> using the hash value of the origi...
Definition: HashKeyMap.h:53
static void flattenProfile(SampleProfileMap &ProfileMap, bool ProfileIsCS=false)
Definition: SampleProf.h:1424
bool hasState(ContextStateMask S)
Definition: SampleProf.h:612
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:608
This class provides operator overloads to the map container using MD5 as the key type,...
Definition: SampleProf.h:1313
Sample-based profile reader.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(StringRef Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, StringRef RemapFilename="")
Create a sample profile reader appropriate to the file format.
std::unordered_map< FunctionId, uint64_t > CallTargetMap
Definition: SampleProf.h:338
static const SortedCallTargetSet sortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:407
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:416
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:211
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
static FunctionId getRepInFormat(StringRef Name)
Get the proper representation of a string according to whether the current Format uses MD5 to represe...
Definition: SampleProf.h:1299
std::map< FunctionId, FunctionSamples > FunctionSamplesMap
Definition: SampleProf.h:735
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot with respect to the hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets a vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
cl::opt< int > ProfileInlineLimitMin
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
static void buildTopDownFuncOrder(LazyCallGraph &CG, std::vector< Function * > &FunctionOrderList)
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:76
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1301
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit an ORE message based on cost (default heuristic).
SmallVector< InstrProfValueData, 4 > getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst and return them if Inst is annotated with value profile dat...
Definition: InstrProf.cpp:1369
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
cl::opt< int > SampleHotCallSiteThreshold
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
cl::opt< int > SampleColdCallSiteThreshold
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
@ DS_Warning
static bool skipProfileForFunction(const Function &F)
cl::opt< bool > SortProfiledSCC
cl::opt< int > ProfileInlineLimitMax
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:57
cl::opt< int > ProfileInlineGrowthLimit
constexpr const char * PseudoProbeDescMetadataName
Definition: PseudoProbe.h:25
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
Used in the streaming interface as the general argument type.
A wrapper of binary function with basic blocks and jumps.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:58
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
static std::optional< uint32_t > extractDwarfBaseDiscriminator(uint32_t Value)
Definition: PseudoProbe.h:80