LLVM 20.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringRef.h"
34#include "llvm/ADT/Twine.h"
45#include "llvm/IR/BasicBlock.h"
46#include "llvm/IR/DebugLoc.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalValue.h"
50#include "llvm/IR/InstrTypes.h"
51#include "llvm/IR/Instruction.h"
54#include "llvm/IR/LLVMContext.h"
55#include "llvm/IR/MDBuilder.h"
56#include "llvm/IR/Module.h"
57#include "llvm/IR/PassManager.h"
59#include "llvm/IR/PseudoProbe.h"
66#include "llvm/Support/Debug.h"
70#include "llvm/Transforms/IPO.h"
81#include <algorithm>
82#include <cassert>
83#include <cstdint>
84#include <functional>
85#include <limits>
86#include <map>
87#include <memory>
88#include <queue>
89#include <string>
90#include <system_error>
91#include <utility>
92#include <vector>
93
94using namespace llvm;
95using namespace sampleprof;
96using namespace llvm::sampleprofutil;
98#define DEBUG_TYPE "sample-profile"
99#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
100
101STATISTIC(NumCSInlined,
102 "Number of functions inlined with context sensitive profile");
103STATISTIC(NumCSNotInlined,
104 "Number of functions not inlined with context sensitive profile");
105STATISTIC(NumMismatchedProfile,
106 "Number of functions with CFG mismatched profile");
107STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
108STATISTIC(NumDuplicatedInlinesite,
109 "Number of inlined callsites with a partial distribution factor");
110
111STATISTIC(NumCSInlinedHitMinLimit,
112 "Number of functions with FDO inline stopped due to min size limit");
113STATISTIC(NumCSInlinedHitMaxLimit,
114 "Number of functions with FDO inline stopped due to max size limit");
116 NumCSInlinedHitGrowthLimit,
117 "Number of functions with FDO inline stopped due to growth size limit");
118
119// Command line option to specify the file to read samples from. This is
120// mainly used for debugging.
122 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
123 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
124
125// The named file contains a set of transformations that may have been applied
126// to the symbol names between the program from which the sample data was
127// collected and the current program's symbols.
129 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
130 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
131
133 "salvage-stale-profile", cl::Hidden, cl::init(false),
134 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
135 "location for sample profile query."));
137 SalvageUnusedProfile("salvage-unused-profile", cl::Hidden, cl::init(false),
138 cl::desc("Salvage unused profile by matching with new "
139 "functions on call graph."));
140
142 "report-profile-staleness", cl::Hidden, cl::init(false),
143 cl::desc("Compute and report stale profile statistical metrics."));
144
146 "persist-profile-staleness", cl::Hidden, cl::init(false),
147 cl::desc("Compute stale profile statistical metrics and write it into the "
148 "native object file(.llvm_stats section)."));
149
151 "profile-sample-accurate", cl::Hidden, cl::init(false),
152 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
153 "callsite and function as having 0 samples. Otherwise, treat "
154 "un-sampled callsites and functions conservatively as unknown. "));
155
157 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
158 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
159 "branches and calls as having 0 samples. Otherwise, treat "
160 "them conservatively as unknown. "));
161
163 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
164 cl::desc("For symbols in profile symbol list, regard their profiles to "
165 "be accurate. It may be overridden by profile-sample-accurate. "));
166
168 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
169 cl::desc("Merge past inlinee's profile to outline version if sample "
170 "profile loader decided not to inline a call site. It will "
171 "only be enabled when top-down order of profile loading is "
172 "enabled. "));
173
175 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
176 cl::desc("Do profile annotation and inlining for functions in top-down "
177 "order of call graph during sample profile loading. It only "
178 "works for new pass manager. "));
179
180static cl::opt<bool>
181 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
182 cl::desc("Process functions in a top-down order "
183 "defined by the profiled call graph when "
184 "-sample-profile-top-down-load is on."));
185
187 "sample-profile-inline-size", cl::Hidden, cl::init(false),
188 cl::desc("Inline cold call sites in profile loader if it's beneficial "
189 "for code size."));
190
191// Since profiles are consumed by many passes, turning on this option has
192// side effects. For instance, pre-link SCC inliner would see merged profiles
193// and inline the hot functions (that are skipped in this pass).
195 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
196 cl::desc(
197 "If true, artificially skip inline transformation in sample-loader "
198 "pass, and merge (or scale) profiles (as configured by "
199 "--sample-profile-merge-inlinee)."));
200
201namespace llvm {
203 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
204 cl::desc("Sort profiled recursion by edge weights."));
205
207 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
208 cl::desc("The size growth ratio limit for proirity-based sample profile "
209 "loader inlining."));
210
212 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
213 cl::desc("The lower bound of size growth limit for "
214 "proirity-based sample profile loader inlining."));
215
217 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
218 cl::desc("The upper bound of size growth limit for "
219 "proirity-based sample profile loader inlining."));
220
222 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
223 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
224 "inlining."));
225
227 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
228 cl::desc("Threshold for inlining cold callsites"));
229} // namespace llvm
230
232 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
233 cl::desc(
234 "Relative hotness percentage threshold for indirect "
235 "call promotion in proirity-based sample profile loader inlining."));
236
238 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
239 cl::desc(
240 "Skip relative hotness check for ICP up to given number of targets."));
241
243 "hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000),
244 cl::desc("A function is considered hot for staleness error check if its "
245 "total sample count is above the specified percentile"));
246
248 "min-functions-for-staleness-error", cl::Hidden, cl::init(50),
249 cl::desc("Skip the check if the number of hot functions is smaller than "
250 "the specified number."));
251
253 "precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80),
254 cl::desc("Reject the profile if the mismatch percent is higher than the "
255 "given number."));
256
258 "sample-profile-prioritized-inline", cl::Hidden,
259 cl::desc("Use call site prioritized inlining for sample profile loader. "
260 "Currently only CSSPGO is supported."));
261
263 "sample-profile-use-preinliner", cl::Hidden,
264 cl::desc("Use the preinliner decisions stored in profile context."));
265
267 "sample-profile-recursive-inline", cl::Hidden,
268 cl::desc("Allow sample loader inliner to inline recursive calls."));
269
271 "sample-profile-remove-probe", cl::Hidden, cl::init(false),
272 cl::desc("Remove pseudo-probe after sample profile annotation."));
273
275 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
276 cl::desc(
277 "Optimization remarks file containing inline remarks to be replayed "
278 "by inlining from sample profile loader."),
279 cl::Hidden);
280
282 "sample-profile-inline-replay-scope",
283 cl::init(ReplayInlinerSettings::Scope::Function),
284 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
285 "Replay on functions that have remarks associated "
286 "with them (default)"),
287 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
288 "Replay on the entire module")),
289 cl::desc("Whether inline replay should be applied to the entire "
290 "Module or just the Functions (default) that are present as "
291 "callers in remarks during sample profile inlining."),
292 cl::Hidden);
293
295 "sample-profile-inline-replay-fallback",
296 cl::init(ReplayInlinerSettings::Fallback::Original),
299 ReplayInlinerSettings::Fallback::Original, "Original",
300 "All decisions not in replay send to original advisor (default)"),
301 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
302 "AlwaysInline", "All decisions not in replay are inlined"),
303 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
304 "All decisions not in replay are not inlined")),
305 cl::desc("How sample profile inline replay treats sites that don't come "
306 "from the replay. Original: defers to original advisor, "
307 "AlwaysInline: inline all sites not in replay, NeverInline: "
308 "inline no sites not in replay"),
309 cl::Hidden);
310
312 "sample-profile-inline-replay-format",
313 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
315 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
316 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
317 "<Line Number>:<Column Number>"),
318 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
319 "LineDiscriminator", "<Line Number>.<Discriminator>"),
320 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
321 "LineColumnDiscriminator",
322 "<Line Number>:<Column Number>.<Discriminator> (default)")),
323 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
324
326 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
327 cl::desc("Max number of promotions for a single indirect "
328 "call callsite in sample profile loader"));
329
331 "overwrite-existing-weights", cl::Hidden, cl::init(false),
332 cl::desc("Ignore existing branch weights on IR and always overwrite."));
333
335 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
336 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
337 "sample-profile inline pass name."));
338
339namespace llvm {
341}
342
343namespace {
344
345using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
346using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
347using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
348using EdgeWeightMap = DenseMap<Edge, uint64_t>;
349using BlockEdgeMap =
351
352class GUIDToFuncNameMapper {
353public:
354 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
355 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
356 : CurrentReader(Reader), CurrentModule(M),
357 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
358 if (!CurrentReader.useMD5())
359 return;
360
361 for (const auto &F : CurrentModule) {
362 StringRef OrigName = F.getName();
363 CurrentGUIDToFuncNameMap.insert(
364 {Function::getGUID(OrigName), OrigName});
365
366 // Local to global var promotion used by optimization like thinlto
367 // will rename the var and add suffix like ".llvm.xxx" to the
368 // original local name. In sample profile, the suffixes of function
369 // names are all stripped. Since it is possible that the mapper is
370 // built in post-thin-link phase and var promotion has been done,
371 // we need to add the substring of function name without the suffix
372 // into the GUIDToFuncNameMap.
374 if (CanonName != OrigName)
375 CurrentGUIDToFuncNameMap.insert(
376 {Function::getGUID(CanonName), CanonName});
377 }
378
379 // Update GUIDToFuncNameMap for each function including inlinees.
380 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
381 }
382
383 ~GUIDToFuncNameMapper() {
384 if (!CurrentReader.useMD5())
385 return;
386
387 CurrentGUIDToFuncNameMap.clear();
388
389 // Reset GUIDToFuncNameMap for of each function as they're no
390 // longer valid at this point.
391 SetGUIDToFuncNameMapForAll(nullptr);
392 }
393
394private:
395 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
396 std::queue<FunctionSamples *> FSToUpdate;
397 for (auto &IFS : CurrentReader.getProfiles()) {
398 FSToUpdate.push(&IFS.second);
399 }
400
401 while (!FSToUpdate.empty()) {
402 FunctionSamples *FS = FSToUpdate.front();
403 FSToUpdate.pop();
404 FS->GUIDToFuncNameMap = Map;
405 for (const auto &ICS : FS->getCallsiteSamples()) {
406 const FunctionSamplesMap &FSMap = ICS.second;
407 for (const auto &IFS : FSMap) {
408 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
409 FSToUpdate.push(&FS);
410 }
411 }
412 }
413 }
414
416 Module &CurrentModule;
417 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
418};
419
420// Inline candidate used by iterative callsite prioritized inliner
421struct InlineCandidate {
422 CallBase *CallInstr;
423 const FunctionSamples *CalleeSamples;
424 // Prorated callsite count, which will be used to guide inlining. For example,
425 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
426 // copies will get their own distribution factors and their prorated counts
427 // will be used to decide if they should be inlined independently.
428 uint64_t CallsiteCount;
429 // Call site distribution factor to prorate the profile samples for a
430 // duplicated callsite. Default value is 1.0.
431 float CallsiteDistribution;
432};
433
434// Inline candidate comparer using call site weight
435struct CandidateComparer {
436 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
437 if (LHS.CallsiteCount != RHS.CallsiteCount)
438 return LHS.CallsiteCount < RHS.CallsiteCount;
439
440 const FunctionSamples *LCS = LHS.CalleeSamples;
441 const FunctionSamples *RCS = RHS.CalleeSamples;
442 // In inline replay mode, CalleeSamples may be null and the order doesn't
443 // matter.
444 if (!LCS || !RCS)
445 return LCS;
446
447 // Tie breaker using number of samples try to favor smaller functions first
448 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
449 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
450
451 // Tie breaker using GUID so we have stable/deterministic inlining order
452 return LCS->getGUID() < RCS->getGUID();
453 }
454};
455
456using CandidateQueue =
458 CandidateComparer>;
459
460/// Sample profile pass.
461///
462/// This pass reads profile data from the file specified by
463/// -sample-profile-file and annotates every affected function with the
464/// profile information found in that file.
465class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {
466public:
467 SampleProfileLoader(
468 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
470 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
471 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
472 std::function<const TargetLibraryInfo &(Function &)> GetTLI,
473 LazyCallGraph &CG, bool DisableSampleProfileInlining,
474 bool UseFlattenedProfile)
476 std::move(FS)),
477 GetAC(std::move(GetAssumptionCache)),
478 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
479 CG(CG), LTOPhase(LTOPhase),
480 AnnotatedPassName(AnnotateSampleProfileInlinePhase
484 DisableSampleProfileInlining(DisableSampleProfileInlining),
485 UseFlattenedProfile(UseFlattenedProfile) {}
486
487 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
488 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
489 ProfileSummaryInfo *_PSI);
490
491protected:
493 bool emitAnnotations(Function &F);
495 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
496 const FunctionSamples *
497 findFunctionSamples(const Instruction &I) const override;
498 std::vector<const FunctionSamples *>
499 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
500 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
501 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
502 uint64_t Threshold);
503 // Attempt to promote indirect call and also inline the promoted call
504 bool tryPromoteAndInlineCandidate(
505 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
506 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
507
508 bool inlineHotFunctions(Function &F,
509 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
510 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
511 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
512 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
513 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
514 bool
515 tryInlineCandidate(InlineCandidate &Candidate,
516 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
517 bool
518 inlineHotFunctionsWithPriority(Function &F,
519 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
520 // Inline cold/small functions in addition to hot ones
521 bool shouldInlineColdCallee(CallBase &CallInst);
522 void emitOptimizationRemarksForInlineCandidates(
523 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
524 bool Hot);
525 void promoteMergeNotInlinedContextSamples(
527 const Function &F);
528 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
529 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
530 void generateMDProfMetadata(Function &F);
531 bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
532 const SampleProfileMap &Profiles);
533 void removePseudoProbeInstsDiscriminator(Module &M);
534
535 /// Map from function name to Function *. Used to find the function from
536 /// the function name. If the function name contains suffix, additional
537 /// entry is added to map from the stripped name to the function if there
538 /// is one-to-one mapping.
540
541 /// Map from function name to profile name generated by call-graph based
542 /// profile fuzzy matching(--salvage-unused-profile).
544
545 std::function<AssumptionCache &(Function &)> GetAC;
546 std::function<TargetTransformInfo &(Function &)> GetTTI;
547 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
548 LazyCallGraph &CG;
549
550 /// Profile tracker for different context.
551 std::unique_ptr<SampleContextTracker> ContextTracker;
552
553 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
554 ///
555 /// We need to know the LTO phase because for example in ThinLTOPrelink
556 /// phase, in annotation, we should not promote indirect calls. Instead,
557 /// we will mark GUIDs that needs to be annotated to the function.
558 const ThinOrFullLTOPhase LTOPhase;
559 const std::string AnnotatedPassName;
560
561 /// Profile symbol list tells whether a function name appears in the binary
562 /// used to generate the current profile.
563 std::shared_ptr<ProfileSymbolList> PSL;
564
565 /// Total number of samples collected in this profile.
566 ///
567 /// This is the sum of all the samples collected in all the functions executed
568 /// at runtime.
569 uint64_t TotalCollectedSamples = 0;
570
571 // Information recorded when we declined to inline a call site
572 // because we have determined it is too cold is accumulated for
573 // each callee function. Initially this is just the entry count.
574 struct NotInlinedProfileInfo {
575 uint64_t entryCount;
576 };
578
579 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
580 // all the function symbols defined or declared in current module.
581 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
582
583 // All the Names used in FunctionSamples including outline function
584 // names, inline instance names and call target names.
585 StringSet<> NamesInProfile;
586 // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is
587 // populated, depends on whether the profile uses MD5. Because the name table
588 // generally contains several magnitude more entries than the number of
589 // functions, we do not want to convert all names from one form to another.
590 llvm::DenseSet<uint64_t> GUIDsInProfile;
591
592 // For symbol in profile symbol list, whether to regard their profiles
593 // to be accurate. It is mainly decided by existence of profile symbol
594 // list and -profile-accurate-for-symsinlist flag, but it can be
595 // overridden by -profile-sample-accurate or profile-sample-accurate
596 // attribute.
597 bool ProfAccForSymsInList;
598
599 bool DisableSampleProfileInlining;
600
601 bool UseFlattenedProfile;
602
603 // External inline advisor used to replay inline decision from remarks.
604 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
605
606 // A helper to implement the sample profile matching algorithm.
607 std::unique_ptr<SampleProfileMatcher> MatchingManager;
608
609private:
610 const char *getAnnotatedRemarkPassName() const {
611 return AnnotatedPassName.c_str();
612 }
613};
614} // end anonymous namespace
615
616namespace llvm {
617template <>
619 return succ_empty(BB);
620}
621
622template <>
624 const std::vector<const BasicBlockT *> &BasicBlocks,
625 BlockEdgeMap &Successors, FlowFunction &Func) {
626 for (auto &Jump : Func.Jumps) {
627 const auto *BB = BasicBlocks[Jump.Source];
628 const auto *Succ = BasicBlocks[Jump.Target];
629 const Instruction *TI = BB->getTerminator();
630 // Check if a block ends with InvokeInst and mark non-taken branch unlikely.
631 // In that case block Succ should be a landing pad
632 if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
633 if (isa<InvokeInst>(TI)) {
634 Jump.IsUnlikely = true;
635 }
636 }
637 const Instruction *SuccTI = Succ->getTerminator();
638 // Check if the target block contains UnreachableInst and mark it unlikely
639 if (SuccTI->getNumSuccessors() == 0) {
640 if (isa<UnreachableInst>(SuccTI)) {
641 Jump.IsUnlikely = true;
642 }
643 }
644 }
645}
646
647template <>
649 Function &F) {
650 DT.reset(new DominatorTree);
651 DT->recalculate(F);
652
653 PDT.reset(new PostDominatorTree(F));
654
655 LI.reset(new LoopInfo);
656 LI->analyze(*DT);
657}
658} // namespace llvm
659
660ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
662 return getProbeWeight(Inst);
663
664 const DebugLoc &DLoc = Inst.getDebugLoc();
665 if (!DLoc)
666 return std::error_code();
667
668 // Ignore all intrinsics, phinodes and branch instructions.
669 // Branch and phinodes instruction usually contains debug info from sources
670 // outside of the residing basic block, thus we ignore them during annotation.
671 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
672 return std::error_code();
673
674 // For non-CS profile, if a direct call/invoke instruction is inlined in
675 // profile (findCalleeFunctionSamples returns non-empty result), but not
676 // inlined here, it means that the inlined callsite has no sample, thus the
677 // call instruction should have 0 count.
678 // For CS profile, the callsite count of previously inlined callees is
679 // populated with the entry count of the callees.
681 if (const auto *CB = dyn_cast<CallBase>(&Inst))
682 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
683 return 0;
684
685 return getInstWeightImpl(Inst);
686}
687
688/// Get the FunctionSamples for a call instruction.
689///
690/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
691/// instance in which that call instruction is calling to. It contains
692/// all samples that resides in the inlined instance. We first find the
693/// inlined instance in which the call instruction is from, then we
694/// traverse its children to find the callsite with the matching
695/// location.
696///
697/// \param Inst Call/Invoke instruction to query.
698///
699/// \returns The FunctionSamples pointer to the inlined instance.
700const FunctionSamples *
701SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
702 const DILocation *DIL = Inst.getDebugLoc();
703 if (!DIL) {
704 return nullptr;
705 }
706
707 StringRef CalleeName;
708 if (Function *Callee = Inst.getCalledFunction())
709 CalleeName = Callee->getName();
710
712 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
713
714 const FunctionSamples *FS = findFunctionSamples(Inst);
715 if (FS == nullptr)
716 return nullptr;
717
718 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
719 CalleeName, Reader->getRemapper(),
720 &FuncNameToProfNameMap);
721}
722
723/// Returns a vector of FunctionSamples that are the indirect call targets
724/// of \p Inst. The vector is sorted by the total number of samples. Stores
725/// the total call count of the indirect call in \p Sum.
726std::vector<const FunctionSamples *>
727SampleProfileLoader::findIndirectCallFunctionSamples(
728 const Instruction &Inst, uint64_t &Sum) const {
729 const DILocation *DIL = Inst.getDebugLoc();
730 std::vector<const FunctionSamples *> R;
731
732 if (!DIL) {
733 return R;
734 }
735
736 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
737 assert(L && R && "Expect non-null FunctionSamples");
738 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
739 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
740 return L->getGUID() < R->getGUID();
741 };
742
744 auto CalleeSamples =
745 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
746 if (CalleeSamples.empty())
747 return R;
748
749 // For CSSPGO, we only use target context profile's entry count
750 // as that already includes both inlined callee and non-inlined ones..
751 Sum = 0;
752 for (const auto *const FS : CalleeSamples) {
753 Sum += FS->getHeadSamplesEstimate();
754 R.push_back(FS);
755 }
756 llvm::sort(R, FSCompare);
757 return R;
758 }
759
760 const FunctionSamples *FS = findFunctionSamples(Inst);
761 if (FS == nullptr)
762 return R;
763
765 Sum = 0;
766 if (auto T = FS->findCallTargetMapAt(CallSite))
767 for (const auto &T_C : *T)
768 Sum += T_C.second;
769 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
770 if (M->empty())
771 return R;
772 for (const auto &NameFS : *M) {
773 Sum += NameFS.second.getHeadSamplesEstimate();
774 R.push_back(&NameFS.second);
775 }
776 llvm::sort(R, FSCompare);
777 }
778 return R;
779}
780
781const FunctionSamples *
782SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
784 std::optional<PseudoProbe> Probe = extractProbe(Inst);
785 if (!Probe)
786 return nullptr;
787 }
788
789 const DILocation *DIL = Inst.getDebugLoc();
790 if (!DIL)
791 return Samples;
792
793 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
794 if (it.second) {
796 it.first->second = ContextTracker->getContextSamplesFor(DIL);
797 else
798 it.first->second = Samples->findFunctionSamples(
799 DIL, Reader->getRemapper(), &FuncNameToProfNameMap);
800 }
801 return it.first->second;
802}
803
804/// Check whether the indirect call promotion history of \p Inst allows
805/// the promotion for \p Candidate.
806/// If the profile count for the promotion candidate \p Candidate is
807/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
808/// for \p Inst. If we already have at least MaxNumPromotions
809/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
810/// cannot promote for \p Inst anymore.
811static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
812 uint64_t TotalCount = 0;
813 auto ValueData = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget,
814 MaxNumPromotions, TotalCount, true);
815 // No valid value profile so no promoted targets have been recorded
816 // before. Ok to do ICP.
817 if (ValueData.empty())
818 return true;
819
820 unsigned NumPromoted = 0;
821 for (const auto &V : ValueData) {
822 if (V.Count != NOMORE_ICP_MAGICNUM)
823 continue;
824
825 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
826 // metadata, it means the candidate has been promoted for this
827 // indirect call.
828 if (V.Value == Function::getGUID(Candidate))
829 return false;
830 NumPromoted++;
831 // If already have MaxNumPromotions promotion, don't do it anymore.
832 if (NumPromoted == MaxNumPromotions)
833 return false;
834 }
835 return true;
836}
837
838/// Update indirect call target profile metadata for \p Inst.
839/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
840/// If it is 0, it means updateIDTMetaData is used to mark a
841/// certain target to be promoted already. If it is not zero,
842/// we expect to use it to update the total count in the value profile.
843static void
845 const SmallVectorImpl<InstrProfValueData> &CallTargets,
846 uint64_t Sum) {
847 // Bail out early if MaxNumPromotions is zero.
848 // This prevents allocating an array of zero length below.
849 //
850 // Note `updateIDTMetaData` is called in two places so check
851 // `MaxNumPromotions` inside it.
852 if (MaxNumPromotions == 0)
853 return;
854 // OldSum is the existing total count in the value profile data.
855 uint64_t OldSum = 0;
856 auto ValueData = getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget,
857 MaxNumPromotions, OldSum, true);
858
859 DenseMap<uint64_t, uint64_t> ValueCountMap;
860 if (Sum == 0) {
861 assert((CallTargets.size() == 1 &&
862 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
863 "If sum is 0, assume only one element in CallTargets "
864 "with count being NOMORE_ICP_MAGICNUM");
865 // Initialize ValueCountMap with existing value profile data.
866 for (const auto &V : ValueData)
867 ValueCountMap[V.Value] = V.Count;
868 auto Pair =
869 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
870 // If the target already exists in value profile, decrease the total
871 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
872 if (!Pair.second) {
873 OldSum -= Pair.first->second;
874 Pair.first->second = NOMORE_ICP_MAGICNUM;
875 }
876 Sum = OldSum;
877 } else {
878 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
879 // counts in the value profile.
880 for (const auto &V : ValueData) {
881 if (V.Count == NOMORE_ICP_MAGICNUM)
882 ValueCountMap[V.Value] = V.Count;
883 }
884
885 for (const auto &Data : CallTargets) {
886 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
887 if (Pair.second)
888 continue;
889 // The target represented by Data.Value has already been promoted.
890 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
891 // Sum by Data.Count.
892 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
893 Sum -= Data.Count;
894 }
895 }
896
898 for (const auto &ValueCount : ValueCountMap) {
899 NewCallTargets.emplace_back(
900 InstrProfValueData{ValueCount.first, ValueCount.second});
901 }
902
903 llvm::sort(NewCallTargets,
904 [](const InstrProfValueData &L, const InstrProfValueData &R) {
905 if (L.Count != R.Count)
906 return L.Count > R.Count;
907 return L.Value > R.Value;
908 });
909
910 uint32_t MaxMDCount =
911 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
912 annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
913 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
914}
915
916/// Attempt to promote indirect call and also inline the promoted call.
917///
918/// \param F Caller function.
919/// \param Candidate ICP and inline candidate.
920/// \param SumOrigin Original sum of target counts for indirect call before
921/// promoting given candidate.
922/// \param Sum Prorated sum of remaining target counts for indirect call
923/// after promoting given candidate.
924/// \param InlinedCallSite Output vector for new call sites exposed after
925/// inlining.
926bool SampleProfileLoader::tryPromoteAndInlineCandidate(
927 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
928 SmallVector<CallBase *, 8> *InlinedCallSite) {
929 // Bail out early if sample-loader inliner is disabled.
930 if (DisableSampleProfileInlining)
931 return false;
932
933 // Bail out early if MaxNumPromotions is zero.
934 // This prevents allocating an array of zero length in callees below.
935 if (MaxNumPromotions == 0)
936 return false;
937 auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();
938 auto R = SymbolMap.find(CalleeFunctionName);
939 if (R == SymbolMap.end() || !R->second)
940 return false;
941
942 auto &CI = *Candidate.CallInstr;
943 if (!doesHistoryAllowICP(CI, R->second->getName()))
944 return false;
945
946 const char *Reason = "Callee function not available";
947 // R->getValue() != &F is to prevent promoting a recursive call.
948 // If it is a recursive call, we do not inline it as it could bloat
949 // the code exponentially. There is way to better handle this, e.g.
950 // clone the caller first, and inline the cloned caller if it is
951 // recursive. As llvm does not inline recursive calls, we will
952 // simply ignore it instead of handling it explicitly.
953 if (!R->second->isDeclaration() && R->second->getSubprogram() &&
954 R->second->hasFnAttribute("use-sample-profile") &&
955 R->second != &F && isLegalToPromote(CI, R->second, &Reason)) {
956 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
957 // in the value profile metadata so the target won't be promoted again.
958 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
959 Function::getGUID(R->second->getName()), NOMORE_ICP_MAGICNUM}};
960 updateIDTMetaData(CI, SortedCallTargets, 0);
961
962 auto *DI = &pgo::promoteIndirectCall(
963 CI, R->second, Candidate.CallsiteCount, Sum, false, ORE);
964 if (DI) {
965 Sum -= Candidate.CallsiteCount;
966 // Do not prorate the indirect callsite distribution since the original
967 // distribution will be used to scale down non-promoted profile target
968 // counts later. By doing this we lose track of the real callsite count
969 // for the leftover indirect callsite as a trade off for accurate call
970 // target counts.
971 // TODO: Ideally we would have two separate factors, one for call site
972 // counts and one is used to prorate call target counts.
973 // Do not update the promoted direct callsite distribution at this
974 // point since the original distribution combined with the callee profile
975 // will be used to prorate callsites from the callee if inlined. Once not
976 // inlined, the direct callsite distribution should be prorated so that
977 // the it will reflect the real callsite counts.
978 Candidate.CallInstr = DI;
979 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
980 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
981 if (!Inlined) {
982 // Prorate the direct callsite distribution so that it reflects real
983 // callsite counts.
985 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
986 }
987 return Inlined;
988 }
989 }
990 } else {
991 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
993 Candidate.CallInstr->getName())<< " because "
994 << Reason << "\n");
995 }
996 return false;
997}
998
999bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1000 if (!ProfileSizeInline)
1001 return false;
1002
1004 if (Callee == nullptr)
1005 return false;
1006
1008 GetAC, GetTLI);
1009
1010 if (Cost.isNever())
1011 return false;
1012
1013 if (Cost.isAlways())
1014 return true;
1015
1016 return Cost.getCost() <= SampleColdCallSiteThreshold;
1017}
1018
1019void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1020 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1021 bool Hot) {
1022 for (auto *I : Candidates) {
1023 Function *CalledFunction = I->getCalledFunction();
1024 if (CalledFunction) {
1025 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1026 "InlineAttempt", I->getDebugLoc(),
1027 I->getParent())
1028 << "previous inlining reattempted for "
1029 << (Hot ? "hotness: '" : "size: '")
1030 << ore::NV("Callee", CalledFunction) << "' into '"
1031 << ore::NV("Caller", &F) << "'");
1032 }
1033 }
1034}
1035
1036void SampleProfileLoader::findExternalInlineCandidate(
1037 CallBase *CB, const FunctionSamples *Samples,
1038 DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {
1039
1040 // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
1041 // function make sure it's imported
1042 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1043 // Samples may not exist for replayed function, if so
1044 // just add the direct GUID and move on
1045 if (!Samples) {
1046 InlinedGUIDs.insert(
1047 Function::getGUID(CB->getCalledFunction()->getName()));
1048 return;
1049 }
1050 // Otherwise, drop the threshold to import everything that we can
1051 Threshold = 0;
1052 }
1053
1054 // In some rare cases, call instruction could be changed after being pushed
1055 // into inline candidate queue, this is because earlier inlining may expose
1056 // constant propagation which can change indirect call to direct call. When
1057 // this happens, we may fail to find matching function samples for the
1058 // candidate later, even if a match was found when the candidate was enqueued.
1059 if (!Samples)
1060 return;
1061
1062 // For AutoFDO profile, retrieve candidate profiles by walking over
1063 // the nested inlinee profiles.
1065 // Set threshold to zero to honor pre-inliner decision.
1067 Threshold = 0;
1068 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1069 return;
1070 }
1071
1072 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1073 std::queue<ContextTrieNode *> CalleeList;
1074 CalleeList.push(Caller);
1075 while (!CalleeList.empty()) {
1076 ContextTrieNode *Node = CalleeList.front();
1077 CalleeList.pop();
1078 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1079 // For CSSPGO profile, retrieve candidate profile by walking over the
1080 // trie built for context profile. Note that also take call targets
1081 // even if callee doesn't have a corresponding context profile.
1082 if (!CalleeSample)
1083 continue;
1084
1085 // If pre-inliner decision is used, honor that for importing as well.
1086 bool PreInline =
1089 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1090 continue;
1091
1092 Function *Func = SymbolMap.lookup(CalleeSample->getFunction());
1093 // Add to the import list only when it's defined out of module.
1094 if (!Func || Func->isDeclaration())
1095 InlinedGUIDs.insert(CalleeSample->getGUID());
1096
1097 // Import hot CallTargets, which may not be available in IR because full
1098 // profile annotation cannot be done until backend compilation in ThinLTO.
1099 for (const auto &BS : CalleeSample->getBodySamples())
1100 for (const auto &TS : BS.second.getCallTargets())
1101 if (TS.second > Threshold) {
1102 const Function *Callee = SymbolMap.lookup(TS.first);
1103 if (!Callee || Callee->isDeclaration())
1104 InlinedGUIDs.insert(TS.first.getHashCode());
1105 }
1106
1107 // Import hot child context profile associted with callees. Note that this
1108 // may have some overlap with the call target loop above, but doing this
1109 // based child context profile again effectively allow us to use the max of
1110 // entry count and call target count to determine importing.
1111 for (auto &Child : Node->getAllChildContext()) {
1112 ContextTrieNode *CalleeNode = &Child.second;
1113 CalleeList.push(CalleeNode);
1114 }
1115 }
1116}
1117
1118/// Iteratively inline hot callsites of a function.
1119///
1120/// Iteratively traverse all callsites of the function \p F, so as to
1121/// find out callsites with corresponding inline instances.
1122///
1123/// For such callsites,
1124/// - If it is hot enough, inline the callsites and adds callsites of the callee
1125/// into the caller. If the call is an indirect call, first promote
1126/// it to direct call. Each indirect call is limited with a single target.
1127///
1128/// - If a callsite is not inlined, merge the its profile to the outline
1129/// version (if --sample-profile-merge-inlinee is true), or scale the
1130/// counters of standalone function based on the profile of inlined
1131/// instances (if --sample-profile-merge-inlinee is false).
1132///
1133/// Later passes may consume the updated profiles.
1134///
1135/// \param F function to perform iterative inlining.
1136/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1137/// inlined in the profiled binary.
1138///
1139/// \returns True if there is any inline happened.
1140bool SampleProfileLoader::inlineHotFunctions(
1141 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1142 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1143 // Profile symbol list is ignored when profile-sample-accurate is on.
1144 assert((!ProfAccForSymsInList ||
1146 !F.hasFnAttribute("profile-sample-accurate"))) &&
1147 "ProfAccForSymsInList should be false when profile-sample-accurate "
1148 "is enabled");
1149
1150 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1151 bool Changed = false;
1152 bool LocalChanged = true;
1153 while (LocalChanged) {
1154 LocalChanged = false;
1156 for (auto &BB : F) {
1157 bool Hot = false;
1158 SmallVector<CallBase *, 10> AllCandidates;
1159 SmallVector<CallBase *, 10> ColdCandidates;
1160 for (auto &I : BB) {
1161 const FunctionSamples *FS = nullptr;
1162 if (auto *CB = dyn_cast<CallBase>(&I)) {
1163 if (!isa<IntrinsicInst>(I)) {
1164 if ((FS = findCalleeFunctionSamples(*CB))) {
1165 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1166 "GUIDToFuncNameMap has to be populated");
1167 AllCandidates.push_back(CB);
1168 if (FS->getHeadSamplesEstimate() > 0 ||
1170 LocalNotInlinedCallSites.insert({CB, FS});
1171 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1172 Hot = true;
1173 else if (shouldInlineColdCallee(*CB))
1174 ColdCandidates.push_back(CB);
1175 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1176 AllCandidates.push_back(CB);
1177 }
1178 }
1179 }
1180 }
1181 if (Hot || ExternalInlineAdvisor) {
1182 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1183 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1184 } else {
1185 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1186 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1187 }
1188 }
1189 for (CallBase *I : CIS) {
1190 Function *CalledFunction = I->getCalledFunction();
1191 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1192 0 /* dummy count */,
1193 1.0 /* dummy distribution factor */};
1194 // Do not inline recursive calls.
1195 if (CalledFunction == &F)
1196 continue;
1197 if (I->isIndirectCall()) {
1198 uint64_t Sum;
1199 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1200 uint64_t SumOrigin = Sum;
1201 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1202 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1203 PSI->getOrCompHotCountThreshold());
1204 continue;
1205 }
1206 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1207 continue;
1208
1209 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1210 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1211 LocalNotInlinedCallSites.erase(I);
1212 LocalChanged = true;
1213 }
1214 }
1215 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1216 !CalledFunction->isDeclaration()) {
1217 if (tryInlineCandidate(Candidate)) {
1218 LocalNotInlinedCallSites.erase(I);
1219 LocalChanged = true;
1220 }
1221 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1222 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1223 InlinedGUIDs,
1224 PSI->getOrCompHotCountThreshold());
1225 }
1226 }
1227 Changed |= LocalChanged;
1228 }
1229
1230 // For CS profile, profile for not inlined context will be merged when
1231 // base profile is being retrieved.
1233 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1234 return Changed;
1235}
1236
1237bool SampleProfileLoader::tryInlineCandidate(
1238 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1239 // Do not attempt to inline a candidate if
1240 // --disable-sample-loader-inlining is true.
1241 if (DisableSampleProfileInlining)
1242 return false;
1243
1244 CallBase &CB = *Candidate.CallInstr;
1245 Function *CalledFunction = CB.getCalledFunction();
1246 assert(CalledFunction && "Expect a callee with definition");
1247 DebugLoc DLoc = CB.getDebugLoc();
1248 BasicBlock *BB = CB.getParent();
1249
1250 InlineCost Cost = shouldInlineCandidate(Candidate);
1251 if (Cost.isNever()) {
1252 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1253 "InlineFail", DLoc, BB)
1254 << "incompatible inlining");
1255 return false;
1256 }
1257
1258 if (!Cost)
1259 return false;
1260
1261 InlineFunctionInfo IFI(GetAC);
1262 IFI.UpdateProfile = false;
1263 InlineResult IR = InlineFunction(CB, IFI,
1264 /*MergeAttributes=*/true);
1265 if (!IR.isSuccess())
1266 return false;
1267
1268 // The call to InlineFunction erases I, so we can't pass it here.
1269 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1270 Cost, true, getAnnotatedRemarkPassName());
1271
1272 // Now populate the list of newly exposed call sites.
1273 if (InlinedCallSites) {
1274 InlinedCallSites->clear();
1275 for (auto &I : IFI.InlinedCallSites)
1276 InlinedCallSites->push_back(I);
1277 }
1278
1280 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1281 ++NumCSInlined;
1282
1283 // Prorate inlined probes for a duplicated inlining callsite which probably
1284 // has a distribution less than 100%. Samples for an inlinee should be
1285 // distributed among the copies of the original callsite based on each
1286 // callsite's distribution factor for counts accuracy. Note that an inlined
1287 // probe may come with its own distribution factor if it has been duplicated
1288 // in the inlinee body. The two factor are multiplied to reflect the
1289 // aggregation of duplication.
1290 if (Candidate.CallsiteDistribution < 1) {
1291 for (auto &I : IFI.InlinedCallSites) {
1292 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1293 setProbeDistributionFactor(*I, Probe->Factor *
1294 Candidate.CallsiteDistribution);
1295 }
1296 NumDuplicatedInlinesite++;
1297 }
1298
1299 return true;
1300}
1301
1302bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1303 CallBase *CB) {
1304 assert(CB && "Expect non-null call instruction");
1305
1306 if (isa<IntrinsicInst>(CB))
1307 return false;
1308
1309 // Find the callee's profile. For indirect call, find hottest target profile.
1310 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1311 // If ExternalInlineAdvisor wants to inline this site, do so even
1312 // if Samples are not present.
1313 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1314 return false;
1315
1316 float Factor = 1.0;
1317 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1318 Factor = Probe->Factor;
1319
1320 uint64_t CallsiteCount =
1321 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1322 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1323 return true;
1324}
1325
1326std::optional<InlineCost>
1327SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1328 std::unique_ptr<InlineAdvice> Advice = nullptr;
1329 if (ExternalInlineAdvisor) {
1330 Advice = ExternalInlineAdvisor->getAdvice(CB);
1331 if (Advice) {
1332 if (!Advice->isInliningRecommended()) {
1333 Advice->recordUnattemptedInlining();
1334 return InlineCost::getNever("not previously inlined");
1335 }
1336 Advice->recordInlining();
1337 return InlineCost::getAlways("previously inlined");
1338 }
1339 }
1340
1341 return {};
1342}
1343
1344bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1345 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1346 return Cost ? !!*Cost : false;
1347}
1348
1350SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1351 if (std::optional<InlineCost> ReplayCost =
1352 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1353 return *ReplayCost;
1354 // Adjust threshold based on call site hotness, only do this for callsite
1355 // prioritized inliner because otherwise cost-benefit check is done earlier.
1356 int SampleThreshold = SampleColdCallSiteThreshold;
1358 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1359 SampleThreshold = SampleHotCallSiteThreshold;
1360 else if (!ProfileSizeInline)
1361 return InlineCost::getNever("cold callsite");
1362 }
1363
1364 Function *Callee = Candidate.CallInstr->getCalledFunction();
1365 assert(Callee && "Expect a definition for inline candidate of direct call");
1366
1367 InlineParams Params = getInlineParams();
1368 // We will ignore the threshold from inline cost, so always get full cost.
1369 Params.ComputeFullInlineCost = true;
1371 // Checks if there is anything in the reachable portion of the callee at
1372 // this callsite that makes this inlining potentially illegal. Need to
1373 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1374 // when cost exceeds threshold without checking all IRs in the callee.
1375 // The acutal cost does not matter because we only checks isNever() to
1376 // see if it is legal to inline the callsite.
1377 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1378 GetTTI(*Callee), GetAC, GetTLI);
1379
1380 // Honor always inline and never inline from call analyzer
1381 if (Cost.isNever() || Cost.isAlways())
1382 return Cost;
1383
1384 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1385 // decisions based on hotness as well as accurate function byte sizes for
1386 // given context using function/inlinee sizes from previous build. It
1387 // stores the decision in profile, and also adjust/merge context profile
1388 // aiming at better context-sensitive post-inline profile quality, assuming
1389 // all inline decision estimates are going to be honored by compiler. Here
1390 // we replay that inline decision under `sample-profile-use-preinliner`.
1391 // Note that we don't need to handle negative decision from preinliner as
1392 // context profile for not inlined calls are merged by preinliner already.
1393 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1394 // Once two node are merged due to promotion, we're losing some context
1395 // so the original context-sensitive preinliner decision should be ignored
1396 // for SyntheticContext.
1397 SampleContext &Context = Candidate.CalleeSamples->getContext();
1398 if (!Context.hasState(SyntheticContext) &&
1400 return InlineCost::getAlways("preinliner");
1401 }
1402
1403 // For old FDO inliner, we inline the call site if it is below hot threshold,
1404 // even if the function is hot based on sample profile data. This is to
1405 // prevent huge functions from being inlined.
1408 }
1409
1410 // Otherwise only use the cost from call analyzer, but overwite threshold with
1411 // Sample PGO threshold.
1412 return InlineCost::get(Cost.getCost(), SampleThreshold);
1413}
1414
1415bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1416 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1417 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1418 // Profile symbol list is ignored when profile-sample-accurate is on.
1419 assert((!ProfAccForSymsInList ||
1421 !F.hasFnAttribute("profile-sample-accurate"))) &&
1422 "ProfAccForSymsInList should be false when profile-sample-accurate "
1423 "is enabled");
1424
1425 // Populating worklist with initial call sites from root inliner, along
1426 // with call site weights.
1427 CandidateQueue CQueue;
1428 InlineCandidate NewCandidate;
1429 for (auto &BB : F) {
1430 for (auto &I : BB) {
1431 auto *CB = dyn_cast<CallBase>(&I);
1432 if (!CB)
1433 continue;
1434 if (getInlineCandidate(&NewCandidate, CB))
1435 CQueue.push(NewCandidate);
1436 }
1437 }
1438
1439 // Cap the size growth from profile guided inlining. This is needed even
1440 // though cost of each inline candidate already accounts for callee size,
1441 // because with top-down inlining, we can grow inliner size significantly
1442 // with large number of smaller inlinees each pass the cost check.
1444 "Max inline size limit should not be smaller than min inline size "
1445 "limit.");
1446 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1447 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1448 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1449 if (ExternalInlineAdvisor)
1450 SizeLimit = std::numeric_limits<unsigned>::max();
1451
1452 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1453
1454 // Perform iterative BFS call site prioritized inlining
1455 bool Changed = false;
1456 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1457 InlineCandidate Candidate = CQueue.top();
1458 CQueue.pop();
1459 CallBase *I = Candidate.CallInstr;
1460 Function *CalledFunction = I->getCalledFunction();
1461
1462 if (CalledFunction == &F)
1463 continue;
1464 if (I->isIndirectCall()) {
1465 uint64_t Sum = 0;
1466 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
1467 uint64_t SumOrigin = Sum;
1468 Sum *= Candidate.CallsiteDistribution;
1469 unsigned ICPCount = 0;
1470 for (const auto *FS : CalleeSamples) {
1471 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1472 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1473 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1474 PSI->getOrCompHotCountThreshold());
1475 continue;
1476 }
1477 uint64_t EntryCountDistributed =
1478 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1479 // In addition to regular inline cost check, we also need to make sure
1480 // ICP isn't introducing excessive speculative checks even if individual
1481 // target looks beneficial to promote and inline. That means we should
1482 // only do ICP when there's a small number dominant targets.
1483 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1484 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1485 break;
1486 // TODO: Fix CallAnalyzer to handle all indirect calls.
1487 // For indirect call, we don't run CallAnalyzer to get InlineCost
1488 // before actual inlining. This is because we could see two different
1489 // types from the same definition, which makes CallAnalyzer choke as
1490 // it's expecting matching parameter type on both caller and callee
1491 // side. See example from PR18962 for the triggering cases (the bug was
1492 // fixed, but we generate different types).
1493 if (!PSI->isHotCount(EntryCountDistributed))
1494 break;
1495 SmallVector<CallBase *, 8> InlinedCallSites;
1496 // Attach function profile for promoted indirect callee, and update
1497 // call site count for the promoted inline candidate too.
1498 Candidate = {I, FS, EntryCountDistributed,
1499 Candidate.CallsiteDistribution};
1500 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1501 &InlinedCallSites)) {
1502 for (auto *CB : InlinedCallSites) {
1503 if (getInlineCandidate(&NewCandidate, CB))
1504 CQueue.emplace(NewCandidate);
1505 }
1506 ICPCount++;
1507 Changed = true;
1508 } else if (!ContextTracker) {
1509 LocalNotInlinedCallSites.insert({I, FS});
1510 }
1511 }
1512 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1513 !CalledFunction->isDeclaration()) {
1514 SmallVector<CallBase *, 8> InlinedCallSites;
1515 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1516 for (auto *CB : InlinedCallSites) {
1517 if (getInlineCandidate(&NewCandidate, CB))
1518 CQueue.emplace(NewCandidate);
1519 }
1520 Changed = true;
1521 } else if (!ContextTracker) {
1522 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1523 }
1524 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1525 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1526 InlinedGUIDs,
1527 PSI->getOrCompHotCountThreshold());
1528 }
1529 }
1530
1531 if (!CQueue.empty()) {
1532 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1533 ++NumCSInlinedHitMaxLimit;
1534 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1535 ++NumCSInlinedHitMinLimit;
1536 else
1537 ++NumCSInlinedHitGrowthLimit;
1538 }
1539
1540 // For CS profile, profile for not inlined context will be merged when
1541 // base profile is being retrieved.
1543 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1544 return Changed;
1545}
1546
1547void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1549 const Function &F) {
1550 // Accumulate not inlined callsite information into notInlinedSamples
1551 for (const auto &Pair : NonInlinedCallSites) {
1552 CallBase *I = Pair.first;
1553 Function *Callee = I->getCalledFunction();
1554 if (!Callee || Callee->isDeclaration())
1555 continue;
1556
1557 ORE->emit(
1558 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1559 I->getDebugLoc(), I->getParent())
1560 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1561 << "' into '" << ore::NV("Caller", &F) << "'");
1562
1563 ++NumCSNotInlined;
1564 const FunctionSamples *FS = Pair.second;
1565 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1566 continue;
1567 }
1568
1569 // Do not merge a context that is already duplicated into the base profile.
1570 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1571 continue;
1572
1573 if (ProfileMergeInlinee) {
1574 // A function call can be replicated by optimizations like callsite
1575 // splitting or jump threading and the replicates end up sharing the
1576 // sample nested callee profile instead of slicing the original
1577 // inlinee's profile. We want to do merge exactly once by filtering out
1578 // callee profiles with a non-zero head sample count.
1579 if (FS->getHeadSamples() == 0) {
1580 // Use entry samples as head samples during the merge, as inlinees
1581 // don't have head samples.
1582 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1583 FS->getHeadSamplesEstimate());
1584
1585 // Note that we have to do the merge right after processing function.
1586 // This allows OutlineFS's profile to be used for annotation during
1587 // top-down processing of functions' annotation.
1588 FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);
1589 // If outlined function does not exist in the profile, add it to a
1590 // separate map so that it does not rehash the original profile.
1591 if (!OutlineFS)
1592 OutlineFS = &OutlineFunctionSamples[
1594 OutlineFS->merge(*FS, 1);
1595 // Set outlined profile to be synthetic to not bias the inliner.
1596 OutlineFS->setContextSynthetic();
1597 }
1598 } else {
1599 auto pair =
1600 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1601 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1602 }
1603 }
1604}
1605
1606/// Returns the sorted CallTargetMap \p M by count in descending order.
1610 for (const auto &I : SampleRecord::sortCallTargets(M)) {
1611 R.emplace_back(
1612 InstrProfValueData{I.first.getHashCode(), I.second});
1613 }
1614 return R;
1615}
1616
1617// Generate MD_prof metadata for every branch instruction using the
1618// edge weights computed during propagation.
1619void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1620 // Generate MD_prof metadata for every branch instruction using the
1621 // edge weights computed during propagation.
1622 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1623 LLVMContext &Ctx = F.getContext();
1624 MDBuilder MDB(Ctx);
1625 for (auto &BI : F) {
1626 BasicBlock *BB = &BI;
1627
1628 if (BlockWeights[BB]) {
1629 for (auto &I : *BB) {
1630 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1631 continue;
1632 if (!cast<CallBase>(I).getCalledFunction()) {
1633 const DebugLoc &DLoc = I.getDebugLoc();
1634 if (!DLoc)
1635 continue;
1636 const DILocation *DIL = DLoc;
1637 const FunctionSamples *FS = findFunctionSamples(I);
1638 if (!FS)
1639 continue;
1642 FS->findCallTargetMapAt(CallSite);
1643 if (!T || T.get().empty())
1644 continue;
1646 // Prorate the callsite counts based on the pre-ICP distribution
1647 // factor to reflect what is already done to the callsite before
1648 // ICP, such as calliste cloning.
1649 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1650 if (Probe->Factor < 1)
1651 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1652 }
1653 }
1654 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1656 uint64_t Sum = 0;
1657 for (const auto &C : T.get())
1658 Sum += C.second;
1659 // With CSSPGO all indirect call targets are counted torwards the
1660 // original indirect call site in the profile, including both
1661 // inlined and non-inlined targets.
1663 if (const FunctionSamplesMap *M =
1664 FS->findFunctionSamplesMapAt(CallSite)) {
1665 for (const auto &NameFS : *M)
1666 Sum += NameFS.second.getHeadSamplesEstimate();
1667 }
1668 }
1669 if (Sum)
1670 updateIDTMetaData(I, SortedCallTargets, Sum);
1671 else if (OverwriteExistingWeights)
1672 I.setMetadata(LLVMContext::MD_prof, nullptr);
1673 } else if (!isa<IntrinsicInst>(&I)) {
1674 setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])},
1675 /*IsExpected=*/false);
1676 }
1677 }
1679 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1680 // clear it for cold code.
1681 for (auto &I : *BB) {
1682 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1683 if (cast<CallBase>(I).isIndirectCall()) {
1684 I.setMetadata(LLVMContext::MD_prof, nullptr);
1685 } else {
1686 setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false);
1687 }
1688 }
1689 }
1690 }
1691
1692 Instruction *TI = BB->getTerminator();
1693 if (TI->getNumSuccessors() == 1)
1694 continue;
1695 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1696 !isa<IndirectBrInst>(TI))
1697 continue;
1698
1699 DebugLoc BranchLoc = TI->getDebugLoc();
1700 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1701 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1702 : Twine("<UNKNOWN LOCATION>"))
1703 << ".\n");
1705 uint32_t MaxWeight = 0;
1706 Instruction *MaxDestInst;
1707 // Since profi treats multiple edges (multiway branches) as a single edge,
1708 // we need to distribute the computed weight among the branches. We do
1709 // this by evenly splitting the edge weight among destinations.
1711 std::vector<uint64_t> EdgeIndex;
1713 EdgeIndex.resize(TI->getNumSuccessors());
1714 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1715 const BasicBlock *Succ = TI->getSuccessor(I);
1716 EdgeIndex[I] = EdgeMultiplicity[Succ];
1717 EdgeMultiplicity[Succ]++;
1718 }
1719 }
1720 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1721 BasicBlock *Succ = TI->getSuccessor(I);
1722 Edge E = std::make_pair(BB, Succ);
1723 uint64_t Weight = EdgeWeights[E];
1724 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1725 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1726 // if needed. Sample counts in profiles are 64-bit unsigned values,
1727 // but internally branch weights are expressed as 32-bit values.
1728 if (Weight > std::numeric_limits<uint32_t>::max()) {
1729 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)\n");
1730 Weight = std::numeric_limits<uint32_t>::max();
1731 }
1732 if (!SampleProfileUseProfi) {
1733 // Weight is added by one to avoid propagation errors introduced by
1734 // 0 weights.
1735 Weights.push_back(static_cast<uint32_t>(
1736 Weight == std::numeric_limits<uint32_t>::max() ? Weight
1737 : Weight + 1));
1738 } else {
1739 // Profi creates proper weights that do not require "+1" adjustments but
1740 // we evenly split the weight among branches with the same destination.
1741 uint64_t W = Weight / EdgeMultiplicity[Succ];
1742 // Rounding up, if needed, so that first branches are hotter.
1743 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1744 W++;
1745 Weights.push_back(static_cast<uint32_t>(W));
1746 }
1747 if (Weight != 0) {
1748 if (Weight > MaxWeight) {
1749 MaxWeight = Weight;
1750 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1751 }
1752 }
1753 }
1754
1755 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1756
1757 uint64_t TempWeight;
1758 // Only set weights if there is at least one non-zero weight.
1759 // In any other case, let the analyzer set weights.
1760 // Do not set weights if the weights are present unless under
1761 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1762 // twice. If the first annotation already set the weights, the second pass
1763 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1764 // weight should have their existing metadata (possibly annotated by LTO
1765 // prelink) cleared.
1766 if (MaxWeight > 0 &&
1767 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1768 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1769 setBranchWeights(*TI, Weights, /*IsExpected=*/false);
1770 ORE->emit([&]() {
1771 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1772 << "most popular destination for conditional branches at "
1773 << ore::NV("CondBranchesLoc", BranchLoc);
1774 });
1775 } else {
1777 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1778 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1779 } else {
1780 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1781 }
1782 }
1783 }
1784}
1785
1786/// Once all the branch weights are computed, we emit the MD_prof
1787/// metadata on BB using the computed values for each of its branches.
///
/// In the visible code this runs, in order: an optional profile validity
/// check, sample-profile-driven inlining, weight computation/propagation,
/// MD_prof generation (only when something changed) and coverage remarks.
1788///
1789/// \param F The function to query.
1790///
1791/// \returns true if \p F was modified. Returns false, otherwise.
1792bool SampleProfileLoader::emitAnnotations(Function &F) {
1793 bool Changed = false;
1794
// NOTE(review): listing line 1795 is missing from this extract. It presumably
// opens the `if` that the `} else {` at listing line 1812 belongs to (the
// branch below uses ProbeManager) -- confirm against the real file.
1796 LLVM_DEBUG({
1797 if (!ProbeManager->getDesc(F))
1798 dbgs() << "Probe descriptor missing for Function " << F.getName()
1799 << "\n";
1800 });
1801
// Keep statistics on whether the probe manager accepts the profile for F.
1802 if (ProbeManager->profileIsValid(F, *Samples)) {
1803 ++NumMatchedProfile;
1804 } else {
1805 ++NumMismatchedProfile;
1806 LLVM_DEBUG(
1807 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1808 << F.getName() << "\n");
// NOTE(review): listing line 1809 is missing, so it cannot be told from here
// whether the `return false` below is unconditional or guarded.
1810 return false;
1811 }
1812 } else {
// Without a usable location for F (getFunctionLoc reports 0) nothing is
// annotated.
1813 if (getFunctionLoc(F) == 0)
1814 return false;
1815
1816 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1817 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1818 }
1819
// Both inliners receive InlinedGUIDs, and the same set is then handed to
// computeAndPropagateWeights.
// NOTE(review): listing line 1821 (the condition selecting the
// priority-based inliner over the default one) is missing from this extract.
1820 DenseSet<GlobalValue::GUID> InlinedGUIDs;
1822 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1823 else
1824 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1825
1826 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1827
// MD_prof metadata is only generated when one of the steps above reported a
// change.
1828 if (Changed)
1829 generateMDProfMetadata(F);
1830
// Coverage remarks are emitted regardless of whether F changed.
1831 emitCoverageRemarks(F);
1832 return Changed;
1833}
1834
/// Build the call graph derived from profile data for module \p M.
///
/// The graph is constructed either from the context tracker or from the
/// reader's profile map, and then functions of \p M are added to it.
///
/// \param M The module whose functions are added to the graph.
///
/// \returns the newly created, uniquely owned ProfiledCallGraph.
1835std::unique_ptr<ProfiledCallGraph>
1836SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1837 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
// NOTE(review): listing line 1838 (the condition choosing between the two
// constructors below) is missing from this extract.
1839 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1840 else
1841 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1842
1843 // Add all functions into the profiled call graph even if they are not in
1844 // the profile. This makes sure functions missing from the profile still
1845 // get a chance to be processed.
1846 for (Function &F : M) {
// NOTE(review): listing line 1847 (the filter that triggers `continue`) and
// listing line 1850 (the argument of addProfiledFunction) are missing from
// this extract.
1848 continue;
1849 ProfiledCG->addProfiledFunction(
1851 }
1852
1853 return ProfiledCG;
1854}
1855
/// Compute the order in which the functions of \p M are processed.
///
/// Three paths are visible: module order when ProfileTopDownLoad is off, an
/// order derived from the profiled call graph, and buildTopDownFuncOrder on
/// \p CG otherwise.
///
/// \param M The module being processed.
/// \param CG The lazy call graph forwarded to buildTopDownFuncOrder.
///
/// \returns the list of functions in processing order.
1856std::vector<Function *>
1857SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1858 std::vector<Function *> FunctionOrderList;
1859 FunctionOrderList.reserve(M.size());
1860
// NOTE(review): listing line 1861 (the condition guarding this warning) is
// missing from this extract.
1862 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1863 "together with -sample-profile-top-down-load.\n";
1864
1865 if (!ProfileTopDownLoad) {
1866 if (ProfileMergeInlinee) {
1867 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1868 // because the profile for a function may be used for the profile
1869 // annotation of its outline copy before the profile merging of its
1870 // non-inlined inline instances, and that is not the way how
1871 // ProfileMergeInlinee is supposed to work.
1872 ProfileMergeInlinee = false;
1873 }
1874
// Without top-down loading the functions are simply taken in module order.
// NOTE(review): listing line 1876 (a per-function filter) is missing from
// this extract.
1875 for (Function &F : M)
1877 FunctionOrderList.push_back(&F);
1878 return FunctionOrderList;
1879 }
1880
// NOTE(review): listing lines 1881-1882 are missing; they presumably open
// the `if` whose `} else` appears at listing line 1948 -- confirm.
1883 // Use profiled call edges to augment the top-down order. There are cases
1884 // that the top-down order computed based on the static call graph doesn't
1885 // reflect real execution order. For example
1886 //
1887 // 1. Incomplete static call graph due to unknown indirect call targets.
1888 // Adjusting the order by considering indirect call edges from the
1889 // profile can enable the inlining of indirect call targets by allowing
1890 // the caller processed before them.
1891 // 2. Mutual call edges in an SCC. The static processing order computed for
1892 // an SCC may not reflect the call contexts in the context-sensitive
1893 // profile, thus may cause potential inlining to be overlooked. The
1894 // function order in one SCC is being adjusted to a top-down order based
1895 // on the profile to favor more inlining. This is only a problem with CS
1896 // profile.
1897 // 3. Transitive indirect call edges due to inlining. When a callee function
1898 // (say B) is inlined into a caller function (say A) in LTO prelink,
1899 // every call edge originated from the callee B will be transferred to
1900 // the caller A. If any transferred edge (say A->C) is indirect, the
1901 // original profiled indirect edge B->C, even if considered, would not
1902 // enforce a top-down order from the caller A to the potential indirect
1903 // call target C in LTO postlink since the inlined callee B is gone from
1904 // the static call graph.
1905 // 4. #3 can happen even for direct call targets, due to functions defined
1906 // in header files. A header function (say A), when included into source
1907 // files, is defined multiple times but only one definition survives due
1908 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1909 // definitions can be useless based on a local file scope. More
1910 // importantly, the inlinee (say B), once fully inlined to a
1911 // to-be-dropped A, will have no profile to consume when its outlined
1912 // version is compiled. This can lead to a profile-less prelink
1913 // compilation for the outlined version of B which may be called from
1914 // external modules. While this isn't easy to fix, we rely on the
1915 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1916 // the A can be inlined in its local scope in prelink, it may not exist
1917 // in the merged IR in postlink, and we'll need the profiled call edges
1918 // to enforce a top-down order for the rest of the functions.
1919 //
1920 // Considering those cases, a profiled call graph completely independent of
1921 // the static call graph is constructed based on profile data, where
1922 // function objects are not even needed to handle case #3 and case #4.
1923 //
1924 // Note that static callgraph edges are completely ignored since they
1925 // can be conflicting with profiled edges for cyclic SCCs and may result in
1926 // an SCC order incompatible with profile-defined one. Using strictly
1927 // profile order ensures a maximum inlining experience. On the other hand,
1928 // static call edges are not so important when they don't correspond to a
1929 // context in the profile.
1930
// Walk the SCCs of the profiled call graph and collect the functions of the
// current module that the nodes map to.
1931 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1932 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1933 while (!CGI.isAtEnd()) {
1934 auto Range = *CGI;
1935 if (SortProfiledSCC) {
1936 // Sort nodes in one SCC based on callsite hotness.
// NOTE(review): listing line 1937 (the declaration of `SI`) is missing from
// this extract.
1938 Range = *SI;
1939 }
1940 for (auto *Node : Range) {
// Nodes with no function in SymbolMap, or whose function is rejected by
// skipProfileForFunction, are left out of the order.
1941 Function *F = SymbolMap.lookup(Node->Name);
1942 if (F && !skipProfileForFunction(*F))
1943 FunctionOrderList.push_back(F);
1944 }
1945 ++CGI;
1946 }
// The collected list is reversed; presumably the SCC walk yields callees
// before callers, so this produces a top-down order -- TODO confirm against
// the scc_iterator documentation.
1947 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1948 } else
1949 buildTopDownFuncOrder(CG, FunctionOrderList);
1950
1951 LLVM_DEBUG({
1952 dbgs() << "Function processing order:\n";
1953 for (auto F : FunctionOrderList) {
1954 dbgs() << F->getName() << "\n";
1955 }
1956 });
1957
1958 return FunctionOrderList;
1959}
1960
// Create the profile reader for module M, read the profile, and configure
// the loader from the properties of that profile (symbol list, CS /
// pre-inlined / probe-based tweaks, context tracker, probe manager, matcher).
//
// Returns false, after emitting a diagnostic, when the profile cannot be
// opened or read, or when a probe-based profile is used on a module that
// moduleIsProbed rejects. Returns true otherwise.
//
// NOTE(review): listing line 1962 (the rest of the parameter list) is missing
// from this extract; the body uses a name `FAM` that is tested for null
// below.
1961bool SampleProfileLoader::doInitialization(Module &M,
1963 auto &Ctx = M.getContext();
1964
1965 auto ReaderOrErr = SampleProfileReader::create(
1966 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
1967 if (std::error_code EC = ReaderOrErr.getError()) {
1968 std::string Msg = "Could not open profile: " + EC.message();
1969 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1970 return false;
1971 }
1972 Reader = std::move(ReaderOrErr.get());
// Flat profiles are skipped only in the ThinLTO post-link phase.
1973 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
1974 // set module before reading the profile so reader may be able to only
1975 // read the function profiles which are used by the current module.
1976 Reader->setModule(&M);
1977 if (std::error_code EC = Reader->read()) {
1978 std::string Msg = "profile reading failed: " + EC.message();
1979 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1980 return false;
1981 }
1982
1983 PSL = Reader->getProfileSymbolList();
1984
// NOTE(review): listing line 1985 (presumably a guard for the assignment
// below) is missing from this extract.
1986 DisableSampleProfileInlining = DisableSampleLoaderInlining;
1987
1988 if (UseFlattenedProfile)
1989 ProfileConverter::flattenProfile(Reader->getProfiles(),
1990 Reader->profileIsCS());
1991
1992 // While profile-sample-accurate is on, ignore symbol list.
// NOTE(review): listing line 1994 (the right-hand side of this assignment)
// is missing from this extract.
1993 ProfAccForSymsInList =
1995 if (ProfAccForSymsInList) {
// Rebuild the sets of names / GUIDs that appear in the profile from the
// reader's name table.
1996 NamesInProfile.clear();
1997 GUIDsInProfile.clear();
1998 if (auto NameTable = Reader->getNameTable()) {
// NOTE(review): listing line 1999 (the condition selecting the GUID branch
// over the name branch) is missing from this extract.
2000 for (auto Name : *NameTable)
2001 GUIDsInProfile.insert(Name.getHashCode());
2002 } else {
2003 for (auto Name : *NameTable)
2004 NamesInProfile.insert(Name.stringRef());
2005 }
2006 }
2007 CoverageTracker.setProfAccForSymsInList(true);
2008 }
2009
// An external replay advisor is only set up when a function analysis
// manager is available and a replay file was given.
2010 if (FAM && !ProfileInlineReplayFile.empty()) {
2011 ExternalInlineAdvisor = getReplayInlineAdvisor(
2012 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
// NOTE(review): listing lines 2013-2016 (further arguments) are missing from
// this extract.
2017 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2018 }
2019
2020 // Apply tweaks if context-sensitive or probe-based profile is available.
2021 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2022 Reader->profileIsProbeBased()) {
// NOTE(review): throughout this branch the lines guarding each assignment
// (listing lines 2023-2025, 2027-2028, 2030, 2032-2033, 2035, 2039, 2049)
// are missing from this extract, so whether each default is applied
// unconditionally cannot be told from here.
2026 SampleProfileUseProfi = true;
2029 // Enable priority-base inliner and size inline by default for CSSPGO.
2031 ProfileSizeInline = true;
2034 // For CSSPGO, we also allow recursive inline to best use context profile.
2036 AllowRecursiveInline = true;
2037
2038 if (Reader->profileIsPreInlined()) {
2040 UsePreInlinerDecision = true;
2041 }
2042
2043 // Enable stale profile matching by default for probe-based profile.
2044 // Currently the matching relies on if the checksum mismatch is detected,
2045 // which is currently only available for pseudo-probe mode. Removing the
2046 // checksum check could cause regressions for some cases, so further tuning
2047 // might be needed if we want to enable it for all cases.
2048 if (Reader->profileIsProbeBased() &&
2050 SalvageStaleProfile = true;
2051 }
2052
2053 if (!Reader->profileIsCS()) {
2054 // Non-CS profile should be fine without a function size budget for the
2055 // inliner since the contexts in the profile are either all from inlining
2056 // in the previous build or pre-computed by the preinliner with a size
2057 // cap, thus they are bounded.
// The limits are only overridden when the user did not set them explicitly
// (getNumOccurrences() is zero).
2058 if (!ProfileInlineLimitMin.getNumOccurrences())
2059 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2060 if (!ProfileInlineLimitMax.getNumOccurrences())
2061 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2062 }
2063 }
2064
2065 if (Reader->profileIsCS()) {
2066 // Tracker for profiles under different context
2067 ContextTracker = std::make_unique<SampleContextTracker>(
2068 Reader->getProfiles(), &GUIDToFuncNameMap);
2069 }
2070
2071 // Load pseudo probe descriptors for probe-based function samples.
2072 if (Reader->profileIsProbeBased()) {
2073 ProbeManager = std::make_unique<PseudoProbeManager>(M);
// A probe-based profile on a module that moduleIsProbed rejects is reported
// as a warning and aborts initialization.
2074 if (!ProbeManager->moduleIsProbed(M)) {
2075 const char *Msg =
2076 "Pseudo-probe-based profile requires SampleProfileProbePass";
2077 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2078 DS_Warning));
2079 return false;
2080 }
2081 }
2082
// NOTE(review): listing lines 2083-2084 (the condition under which the
// matcher is created) are missing from this extract.
2085 MatchingManager = std::make_unique<SampleProfileMatcher>(
2086 M, *Reader, CG, ProbeManager.get(), LTOPhase, SymbolMap, PSL,
2087 FuncNameToProfNameMap);
2088 }
2089
2090 return true;
2091}
2092
2093// Note that this is a module-level check. Even if one module is errored out,
2094// the entire build will be errored out. However, the user could make big
2095// changes to functions in single module but those changes might not be
2096// performance significant to the whole binary. Therefore, to avoid those false
2097// positives, we select a reasonable big set of hot functions that are supposed
2098// to be globally performance significant, only compute and check the mismatch
2099// within those functions. The function selection is based on two criteria:
2100// 1) The function is hot enough, which is tuned by a hotness-based
2101// flag(HotFuncCutoffForStalenessError). 2) The num of function is large enough
2102// which is tuned by the MinfuncsForStalenessError flag.
//
// Returns true, after emitting a diagnostic on the module's context, when
// the share of hash-mismatched functions among the selected ones reaches
// PrecentMismatchForStalenessError percent; returns false otherwise.
2103bool SampleProfileLoader::rejectHighStalenessProfile(
2104 Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) {
// NOTE(review): listing line 2105 (the start of the assertion whose message
// follows) is missing from this extract.
2106 "Only support for probe-based profile");
2107 uint64_t TotalHotFunc = 0;
2108 uint64_t NumMismatchedFunc = 0;
2109 for (const auto &I : Profiles) {
2110 const auto &FS = I.second;
// Profiles without a probe descriptor for their GUID are not counted.
2111 const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());
2112 if (!FuncDesc)
2113 continue;
2114
2115 // Use a hotness-based threshold to control the function selection.
// NOTE(review): listing line 2116 (the start of the hotness test, which takes
// FS.getTotalSamples() as its last argument) is missing from this extract.
2117 FS.getTotalSamples()))
2118 continue;
2119
2120 TotalHotFunc++;
2121 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS))
2122 NumMismatchedFunc++;
2123 }
2124 // Make sure that the num of selected function is not too small to distinguish
2125 // from the user's benign changes.
2126 if (TotalHotFunc < MinfuncsForStalenessError)
2127 return false;
2128
2129 // Finally check the mismatch percentage against the threshold.
// The comparison is done with integer multiplication on both sides instead
// of computing a percentage by division.
2130 if (NumMismatchedFunc * 100 >=
2131 TotalHotFunc * PrecentMismatchForStalenessError) {
2132 auto &Ctx = M.getContext();
2133 const char *Msg =
2134 "The input profile significantly mismatches current source code. "
2135 "Please recollect profile to avoid performance regression.";
2136 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg));
2137 return true;
2138 }
2139 return false;
2140}
2141
// Strip pseudo-probe artifacts from every function of M: PseudoProbeInst
// instructions are erased, and call instructions whose debug location
// carries a pseudo-probe discriminator get a location cloned with a
// different discriminator (0 when no value is available).
2142void SampleProfileLoader::removePseudoProbeInstsDiscriminator(Module &M) {
2143 for (auto &F : M) {
// Instructions are collected first and erased after the walk, so the blocks
// are not modified while they are being iterated.
2144 std::vector<Instruction *> InstsToDel;
2145 for (auto &BB : F) {
2146 for (auto &I : BB) {
2147 if (isa<PseudoProbeInst>(&I))
2148 InstsToDel.push_back(&I);
2149 else if (isa<CallBase>(&I))
// Calls without a debug location are left untouched.
2150 if (const DILocation *DIL = I.getDebugLoc().get()) {
2151 // Restore dwarf discriminator for call.
2152 unsigned Discriminator = DIL->getDiscriminator();
2153 if (DILocation::isPseudoProbeDiscriminator(Discriminator)) {
// NOTE(review): listing line 2155 (the call that computes the value from
// `Discriminator`) is missing from this extract.
2154 std::optional<uint32_t> DwarfDiscriminator =
2156 Discriminator);
2157 I.setDebugLoc(DIL->cloneWithDiscriminator(
2158 DwarfDiscriminator ? *DwarfDiscriminator : 0));
2159 }
2160 }
2161 }
2162 }
2163 for (auto *I : InstsToDel)
2164 I->eraseFromParent();
2165 }
2166}
2167
// Module-level driver: installs the profile summary if the module has none,
// optionally rejects a stale profile, populates SymbolMap, runs stale
// profile matching, and then runs runOnFunction on every function returned
// by buildFunctionOrder.
//
// Returns the OR of the per-function results; returns false early when the
// staleness check visible below rejects the profile.
2168bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2169 ProfileSummaryInfo *_PSI) {
2170 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2171
2172 PSI = _PSI;
// Only install the reader's summary when the module does not already carry
// a (non-CS) profile summary.
2173 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2174 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
// NOTE(review): listing line 2175 (the remaining argument of
// setProfileSummary) is missing from this extract.
2176 PSI->refresh();
2177 }
2178
// NOTE(review): listing line 2179 (the first part of this condition) is
// missing from this extract.
2180 rejectHighStalenessProfile(M, PSI, Reader->getProfiles()))
2181 return false;
2182
2183 // Compute the total number of samples collected in this profile.
2184 for (const auto &I : Reader->getProfiles())
2185 TotalCollectedSamples += I.second.getTotalSamples();
2186
2187 auto Remapper = Reader->getRemapper();
2188 // Populate the symbol map.
2189 for (const auto &N_F : M.getValueSymbolTable()) {
2190 StringRef OrigName = N_F.getKey();
// Symbol table entries that are not functions, or that have an empty name,
// are skipped.
2191 Function *F = dyn_cast<Function>(N_F.getValue());
2192 if (F == nullptr || OrigName.empty())
2193 continue;
2194 SymbolMap[FunctionId(OrigName)] = F;
// NOTE(review): listing line 2195 (the definition of `NewName`) is missing
// from this extract.
2196 if (OrigName != NewName && !NewName.empty()) {
2197 auto r = SymbolMap.emplace(FunctionId(NewName), F);
2198 // Failing to insert means there is already an entry in SymbolMap,
2199 // thus there are multiple functions that are mapped to the same
2200 // stripped name. In this case of name conflicting, set the value
2201 // to nullptr to avoid confusion.
2202 if (!r.second)
2203 r.first->second = nullptr;
2204 OrigName = NewName;
2205 }
2206 // Insert the remapped names into SymbolMap.
2207 if (Remapper) {
2208 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2209 if (*MapName != OrigName && !MapName->empty())
2210 SymbolMap.emplace(FunctionId(*MapName), F);
2211 }
2212 }
2213 }
2214
2215 // Stale profile matching.
// NOTE(review): listing lines 2216-2217 (the condition guarding the matcher
// calls) are missing from this extract.
2218 MatchingManager->runOnModule();
2219 MatchingManager->clearMatchingData();
2220 }
2221 assert(SymbolMap.count(FunctionId()) == 0 &&
2222 "No empty StringRef should be added in SymbolMap");
2223 assert((SalvageUnusedProfile || FuncNameToProfNameMap.empty()) &&
2224 "FuncNameToProfNameMap is not empty when --salvage-unused-profile is "
2225 "not enabled");
2226
2227 bool retval = false;
// Per-function state is cleared before each function is processed.
2228 for (auto *F : buildFunctionOrder(M, CG)) {
2229 assert(!F->isDeclaration());
2230 clearFunctionData();
2231 retval |= runOnFunction(*F, AM);
2232 }
2233
2234 // Account for cold calls not inlined....
// NOTE(review): listing line 2235 (presumably a guard for this loop) is
// missing from this extract.
2236 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2237 notInlinedCallInfo)
2238 updateProfileCallee(pair.first, pair.second.entryCount);
2239
// NOTE(review): listing lines 2240-2241 (the condition under which probes
// and the probe descriptor metadata are removed) are missing from this
// extract.
2242 removePseudoProbeInstsDiscriminator(M);
2243 if (auto *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName))
2244 M.eraseNamedMetadata(FuncInfo);
2245 }
2246
2247 return retval;
2248}
2249
// Per-function driver: picks the initial entry count for F, sets it when F
// has none, selects the remark emitter, looks up the samples for F and, when
// non-empty samples are found, hands over to emitAnnotations.
//
// Returns the result of emitAnnotations, or false when no (or empty) samples
// are found for F.
2250bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2251 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2252 DILocation2SampleMap.clear();
2253 // By default the entry count is initialized to -1, which will be treated
2254 // conservatively by getEntryCount as the same as unknown (None). This is
2255 // to avoid newly added code to be treated as cold. If we have samples
2256 // this will be overwritten in emitAnnotations.
2257 uint64_t initialEntryCount = -1;
2258
2259 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2260 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2261 // initialize all the function entry counts to 0. It means all the
2262 // functions without profile will be regarded as cold.
2263 initialEntryCount = 0;
2264 // profile-sample-accurate is a user assertion which has a higher precedence
2265 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2266 ProfAccForSymsInList = false;
2267 }
2268 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2269
2270 // PSL -- profile symbol list include all the symbols in sampled binary.
2271 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2272 // old functions without samples being cold, without having to worry
2273 // about new and hot functions being mistakenly treated as cold.
2274 if (ProfAccForSymsInList) {
2275 // Initialize the entry count to 0 for functions in the list.
2276 if (PSL->contains(F.getName()))
2277 initialEntryCount = 0;
2278
2279 // Function in the symbol list but without sample will be regarded as
2280 // cold. To minimize the potential negative performance impact it could
2281 // have, we want to be a little conservative here saying if a function
2282 // shows up in the profile, no matter as outline function, inline instance
2283 // or call targets, treat the function as not being cold. This will handle
2284 // the cases such as most callsites of a function are inlined in sampled
2285 // binary but not inlined in current build (because of source code drift,
2286 // imprecise debug information, or the callsites are all cold individually
2287 // but not cold accumulatively...), so the outline function showing up as
2288 // cold in sampled binary will actually not be cold after current build.
// NOTE(review): listing lines 2289-2290 (the definition of `CanonName` and
// the start of this condition) are missing from this extract.
2291 GUIDsInProfile.count(Function::getGUID(CanonName))) ||
2292 (!FunctionSamples::UseMD5 && NamesInProfile.count(CanonName)))
2293 initialEntryCount = -1;
2294 }
2295
2296 // Initialize entry count when the function has no existing entry
2297 // count value.
2298 if (!F.getEntryCount())
2299 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
// When no analysis manager is supplied, a locally owned remark emitter is
// created for F and used through ORE.
2300 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2301 if (AM) {
// NOTE(review): listing lines 2303 and 2305 (how FAM is obtained and how ORE
// is set from it) are missing from this extract.
2302 auto &FAM =
2304 .getManager();
2306 } else {
2307 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2308 ORE = OwnedORE.get();
2309 }
2310
// NOTE(review): listing line 2311 (the condition selecting the context
// tracker over the reader) is missing from this extract.
2312 Samples = ContextTracker->getBaseSamplesFor(F);
2313 else {
2314 Samples = Reader->getSamplesFor(F);
2315 // Try search in previously inlined functions that were split or duplicated
2316 // into base.
2317 if (!Samples) {
// NOTE(review): listing line 2318 (the definition of `CanonName` used in
// this scope) is missing from this extract.
2319 auto It = OutlineFunctionSamples.find(FunctionId(CanonName));
2320 if (It != OutlineFunctionSamples.end()) {
2321 Samples = &It->second;
2322 } else if (auto Remapper = Reader->getRemapper()) {
// Fall back to the remapped name when the canonical name has no entry.
2323 if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {
2324 It = OutlineFunctionSamples.find(FunctionId(*RemppedName));
2325 if (It != OutlineFunctionSamples.end())
2326 Samples = &It->second;
2327 }
2328 }
2329 }
2330 }
2331
2332 if (Samples && !Samples->empty())
2333 return emitAnnotations(F);
2334 return false;
2335}
2337 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2338 IntrusiveRefCntPtr<vfs::FileSystem> FS, bool DisableSampleProfileInlining,
2339 bool UseFlattenedProfile)
2340 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2341 LTOPhase(LTOPhase), FS(std::move(FS)),
2342 DisableSampleProfileInlining(DisableSampleProfileInlining),
2343 UseFlattenedProfile(UseFlattenedProfile) {}
2344
2349
2350 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2352 };
2353 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2355 };
2356 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2358 };
2359
2360 if (!FS)
2363
2364 SampleProfileLoader SampleLoader(
2365 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2366 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2367 : ProfileRemappingFileName,
2368 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG,
2369 DisableSampleProfileInlining, UseFlattenedProfile);
2370 if (!SampleLoader.doInitialization(M, &FAM))
2371 return PreservedAnalyses::all();
2372
2374 if (!SampleLoader.runOnModule(M, &AM, PSI))
2375 return PreservedAnalyses::all();
2376
2377 return PreservedAnalyses::none();
2378}
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
LVReader * CurrentReader
Definition: LVReader.cpp:152
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
This file defines the PriorityQueue class.
This file contains the declarations for profiling metadata utility functions.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for SampleProfileMatcher.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< unsigned > MinfuncsForStalenessError("min-functions-for-staleness-error", cl::Hidden, cl::init(50), cl::desc("Skip the check if the number of hot functions is smaller than " "the specified number."))
cl::opt< bool > SalvageUnusedProfile("salvage-unused-profile", cl::Hidden, cl::init(false), cl::desc("Salvage unused profile by matching with new " "functions on call graph."))
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > PrecentMismatchForStalenessError("precent-mismatch-for-staleness-error", cl::Hidden, cl::init(80), cl::desc("Reject the profile if the mismatch percent is higher than the " "given number."))
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artificially skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > RemoveProbeAfterProfileAnnotation("sample-profile-remove-probe", cl::Hidden, cl::init(false), cl::desc("Remove pseudo-probe after sample profile annotation."))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overridden by profile-sample-accurate. "))
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader. " "Currently only CSSPGO is supported."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
#define DEBUG_TYPE
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< unsigned > HotFuncCutoffForStalenessError("hot-func-cutoff-for-staleness-error", cl::Hidden, cl::init(800000), cl::desc("A function is considered hot for staleness error check if its " "total sample count is above the specified percentile"))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
static bool isPseudoProbeDiscriminator(unsigned Discriminator)
const DILocation * cloneWithDiscriminator(unsigned Discriminator) const
Returns a new DILocation with updated Discriminator.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
Diagnostic information for the sample profiler.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:292
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1874
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:255
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1788
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A lazily constructed view of the call graph of a module.
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:193
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
void refresh()
If no summary is present, attempt to refresh.
bool isHotCountNthPercentile(int PercentileCutoff, uint64_t C) const
Returns true if count C is considered hot with regard to a given hot percentile cutoff value.
Sample profile inference pass.
void computeDominanceAndLoopInfo(FunctionT &F)
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr, bool DisableSampleProfileInlining=false, bool UseFlattenedProfile=false)
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
int getNumOccurrences() const
Definition: CommandLine.h:399
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
const ParentTy * getParent() const
Definition: ilist_node.h:32
This class represents a function that is read from a sample profile.
Definition: FunctionId.h:36
Representation of the samples collected for a function.
Definition: SampleProf.h:745
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const HashKeyMap< std::unordered_map, FunctionId, Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:1041
FunctionId getFunction() const
Return the function name.
Definition: SampleProf.h:1074
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1090
SampleContext & getContext() const
Definition: SampleProf.h:1192
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:998
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:949
uint64_t getGUID() const
Return the GUID of the context's name.
Definition: SampleProf.h:1211
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:973
static bool UseMD5
Whether the profile uses MD5 to represent strings.
Definition: SampleProf.h:1197
This class is a wrapper to associative container MapT<KeyT, ValueT> using the hash value of the origi...
Definition: HashKeyMap.h:53
static void flattenProfile(SampleProfileMap &ProfileMap, bool ProfileIsCS=false)
Definition: SampleProf.h:1424
bool hasState(ContextStateMask S)
Definition: SampleProf.h:612
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:608
This class provides operator overloads to the map container using MD5 as the key type,...
Definition: SampleProf.h:1313
Sample-based profile reader.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(StringRef Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, StringRef RemapFilename="")
Create a sample profile reader appropriate to the file format.
std::unordered_map< FunctionId, uint64_t > CallTargetMap
Definition: SampleProf.h:338
static const SortedCallTargetSet sortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:407
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:416
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:211
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
static FunctionId getRepInFormat(StringRef Name)
Get the proper representation of a string according to whether the current Format uses MD5 to represe...
Definition: SampleProf.h:1299
std::map< FunctionId, FunctionSamples > FunctionSamplesMap
Definition: SampleProf.h:735
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot with respect to the hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets a vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
cl::opt< int > ProfileInlineLimitMin
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
static void buildTopDownFuncOrder(LazyCallGraph &CG, std::vector< Function * > &FunctionOrderList)
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:76
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1301
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based on cost (default heuristic).
SmallVector< InstrProfValueData, 4 > getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst and return them if Inst is annotated with value profile dat...
Definition: InstrProf.cpp:1369
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
cl::opt< int > SampleHotCallSiteThreshold
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
cl::opt< int > SampleColdCallSiteThreshold
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
@ DS_Warning
static bool skipProfileForFunction(const Function &F)
cl::opt< bool > SortProfiledSCC
cl::opt< int > ProfileInlineLimitMax
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:57
cl::opt< int > ProfileInlineGrowthLimit
constexpr const char * PseudoProbeDescMetadataName
Definition: PseudoProbe.h:25
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
Used in the streaming interface as the general argument type.
A wrapper of binary function with basic blocks and jumps.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:58
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
static std::optional< uint32_t > extractDwarfBaseDiscriminator(uint32_t Value)
Definition: PseudoProbe.h:80