LLVM 18.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringMap.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/ADT/Twine.h"
46#include "llvm/IR/BasicBlock.h"
47#include "llvm/IR/DebugLoc.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/InstrTypes.h"
52#include "llvm/IR/Instruction.h"
55#include "llvm/IR/LLVMContext.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/Module.h"
58#include "llvm/IR/PassManager.h"
60#include "llvm/IR/PseudoProbe.h"
67#include "llvm/Support/Debug.h"
71#include "llvm/Transforms/IPO.h"
81#include <algorithm>
82#include <cassert>
83#include <cstdint>
84#include <functional>
85#include <limits>
86#include <map>
87#include <memory>
88#include <queue>
89#include <string>
90#include <system_error>
91#include <utility>
92#include <vector>
93
94using namespace llvm;
95using namespace sampleprof;
96using namespace llvm::sampleprofutil;
98#define DEBUG_TYPE "sample-profile"
99#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
100
101STATISTIC(NumCSInlined,
102 "Number of functions inlined with context sensitive profile");
103STATISTIC(NumCSNotInlined,
104 "Number of functions not inlined with context sensitive profile");
105STATISTIC(NumMismatchedProfile,
106 "Number of functions with CFG mismatched profile");
107STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
108STATISTIC(NumDuplicatedInlinesite,
109 "Number of inlined callsites with a partial distribution factor");
110
111STATISTIC(NumCSInlinedHitMinLimit,
112 "Number of functions with FDO inline stopped due to min size limit");
113STATISTIC(NumCSInlinedHitMaxLimit,
114 "Number of functions with FDO inline stopped due to max size limit");
116 NumCSInlinedHitGrowthLimit,
117 "Number of functions with FDO inline stopped due to growth size limit");
118
119// Command line option to specify the file to read samples from. This is
120// mainly used for debugging.
122 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
123 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
124
125// The named file contains a set of transformations that may have been applied
126// to the symbol names between the program from which the sample data was
127// collected and the current program's symbols.
129 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
130 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
131
133 "salvage-stale-profile", cl::Hidden, cl::init(false),
134 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
135 "location for sample profile query."));
136
138 "report-profile-staleness", cl::Hidden, cl::init(false),
139 cl::desc("Compute and report stale profile statistical metrics."));
140
142 "persist-profile-staleness", cl::Hidden, cl::init(false),
143 cl::desc("Compute stale profile statistical metrics and write it into the "
144 "native object file(.llvm_stats section)."));
145
147 "profile-sample-accurate", cl::Hidden, cl::init(false),
148 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
149 "callsite and function as having 0 samples. Otherwise, treat "
150 "un-sampled callsites and functions conservatively as unknown. "));
151
153 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
154 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
155 "branches and calls as having 0 samples. Otherwise, treat "
156 "them conservatively as unknown. "));
157
159 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
160 cl::desc("For symbols in profile symbol list, regard their profiles to "
161 "be accurate. It may be overriden by profile-sample-accurate. "));
162
164 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
165 cl::desc("Merge past inlinee's profile to outline version if sample "
166 "profile loader decided not to inline a call site. It will "
167 "only be enabled when top-down order of profile loading is "
168 "enabled. "));
169
171 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
172 cl::desc("Do profile annotation and inlining for functions in top-down "
173 "order of call graph during sample profile loading. It only "
174 "works for new pass manager. "));
175
176static cl::opt<bool>
177 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
178 cl::desc("Process functions in a top-down order "
179 "defined by the profiled call graph when "
180 "-sample-profile-top-down-load is on."));
181
183 "sample-profile-inline-size", cl::Hidden, cl::init(false),
184 cl::desc("Inline cold call sites in profile loader if it's beneficial "
185 "for code size."));
186
187// Since profiles are consumed by many passes, turning on this option has
188// side effects. For instance, pre-link SCC inliner would see merged profiles
189// and inline the hot functions (that are skipped in this pass).
191 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
192 cl::desc("If true, artifically skip inline transformation in sample-loader "
193 "pass, and merge (or scale) profiles (as configured by "
194 "--sample-profile-merge-inlinee)."));
195
196namespace llvm {
198 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
199 cl::desc("Sort profiled recursion by edge weights."));
200
202 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
203 cl::desc("The size growth ratio limit for proirity-based sample profile "
204 "loader inlining."));
205
207 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
208 cl::desc("The lower bound of size growth limit for "
209 "proirity-based sample profile loader inlining."));
210
212 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
213 cl::desc("The upper bound of size growth limit for "
214 "proirity-based sample profile loader inlining."));
215
217 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
218 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
219 "inlining."));
220
222 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
223 cl::desc("Threshold for inlining cold callsites"));
224} // namespace llvm
225
227 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
228 cl::desc(
229 "Relative hotness percentage threshold for indirect "
230 "call promotion in proirity-based sample profile loader inlining."));
231
233 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
234 cl::desc(
235 "Skip relative hotness check for ICP up to given number of targets."));
236
238 "sample-profile-prioritized-inline", cl::Hidden,
239
240 cl::desc("Use call site prioritized inlining for sample profile loader."
241 "Currently only CSSPGO is supported."));
242
244 "sample-profile-use-preinliner", cl::Hidden,
245
246 cl::desc("Use the preinliner decisions stored in profile context."));
247
249 "sample-profile-recursive-inline", cl::Hidden,
250
251 cl::desc("Allow sample loader inliner to inline recursive calls."));
252
254 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
255 cl::desc(
256 "Optimization remarks file containing inline remarks to be replayed "
257 "by inlining from sample profile loader."),
258 cl::Hidden);
259
261 "sample-profile-inline-replay-scope",
262 cl::init(ReplayInlinerSettings::Scope::Function),
263 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
264 "Replay on functions that have remarks associated "
265 "with them (default)"),
266 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
267 "Replay on the entire module")),
268 cl::desc("Whether inline replay should be applied to the entire "
269 "Module or just the Functions (default) that are present as "
270 "callers in remarks during sample profile inlining."),
271 cl::Hidden);
272
274 "sample-profile-inline-replay-fallback",
275 cl::init(ReplayInlinerSettings::Fallback::Original),
278 ReplayInlinerSettings::Fallback::Original, "Original",
279 "All decisions not in replay send to original advisor (default)"),
280 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
281 "AlwaysInline", "All decisions not in replay are inlined"),
282 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
283 "All decisions not in replay are not inlined")),
284 cl::desc("How sample profile inline replay treats sites that don't come "
285 "from the replay. Original: defers to original advisor, "
286 "AlwaysInline: inline all sites not in replay, NeverInline: "
287 "inline no sites not in replay"),
288 cl::Hidden);
289
291 "sample-profile-inline-replay-format",
292 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
294 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
295 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
296 "<Line Number>:<Column Number>"),
297 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
298 "LineDiscriminator", "<Line Number>.<Discriminator>"),
299 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
300 "LineColumnDiscriminator",
301 "<Line Number>:<Column Number>.<Discriminator> (default)")),
302 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
303
305 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
306 cl::desc("Max number of promotions for a single indirect "
307 "call callsite in sample profile loader"));
308
310 "overwrite-existing-weights", cl::Hidden, cl::init(false),
311 cl::desc("Ignore existing branch weights on IR and always overwrite."));
312
314 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
315 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
316 "sample-profile inline pass name."));
317
318namespace llvm {
320}
321
322namespace {
323
324using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
325using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
326using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
327using EdgeWeightMap = DenseMap<Edge, uint64_t>;
328using BlockEdgeMap =
330
331class GUIDToFuncNameMapper {
332public:
333 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
334 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
335 : CurrentReader(Reader), CurrentModule(M),
336 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
337 if (!CurrentReader.useMD5())
338 return;
339
340 for (const auto &F : CurrentModule) {
341 StringRef OrigName = F.getName();
342 CurrentGUIDToFuncNameMap.insert(
343 {Function::getGUID(OrigName), OrigName});
344
345 // Local to global var promotion used by optimization like thinlto
346 // will rename the var and add suffix like ".llvm.xxx" to the
347 // original local name. In sample profile, the suffixes of function
348 // names are all stripped. Since it is possible that the mapper is
349 // built in post-thin-link phase and var promotion has been done,
350 // we need to add the substring of function name without the suffix
351 // into the GUIDToFuncNameMap.
353 if (CanonName != OrigName)
354 CurrentGUIDToFuncNameMap.insert(
355 {Function::getGUID(CanonName), CanonName});
356 }
357
358 // Update GUIDToFuncNameMap for each function including inlinees.
359 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
360 }
361
362 ~GUIDToFuncNameMapper() {
363 if (!CurrentReader.useMD5())
364 return;
365
366 CurrentGUIDToFuncNameMap.clear();
367
368 // Reset GUIDToFuncNameMap for of each function as they're no
369 // longer valid at this point.
370 SetGUIDToFuncNameMapForAll(nullptr);
371 }
372
373private:
374 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
375 std::queue<FunctionSamples *> FSToUpdate;
376 for (auto &IFS : CurrentReader.getProfiles()) {
377 FSToUpdate.push(&IFS.second);
378 }
379
380 while (!FSToUpdate.empty()) {
381 FunctionSamples *FS = FSToUpdate.front();
382 FSToUpdate.pop();
383 FS->GUIDToFuncNameMap = Map;
384 for (const auto &ICS : FS->getCallsiteSamples()) {
385 const FunctionSamplesMap &FSMap = ICS.second;
386 for (const auto &IFS : FSMap) {
387 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
388 FSToUpdate.push(&FS);
389 }
390 }
391 }
392 }
393
395 Module &CurrentModule;
396 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
397};
398
399// Inline candidate used by iterative callsite prioritized inliner
400struct InlineCandidate {
401 CallBase *CallInstr;
402 const FunctionSamples *CalleeSamples;
403 // Prorated callsite count, which will be used to guide inlining. For example,
404 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
405 // copies will get their own distribution factors and their prorated counts
406 // will be used to decide if they should be inlined independently.
407 uint64_t CallsiteCount;
408 // Call site distribution factor to prorate the profile samples for a
409 // duplicated callsite. Default value is 1.0.
410 float CallsiteDistribution;
411};
412
413// Inline candidate comparer using call site weight
414struct CandidateComparer {
415 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
416 if (LHS.CallsiteCount != RHS.CallsiteCount)
417 return LHS.CallsiteCount < RHS.CallsiteCount;
418
419 const FunctionSamples *LCS = LHS.CalleeSamples;
420 const FunctionSamples *RCS = RHS.CalleeSamples;
421 assert(LCS && RCS && "Expect non-null FunctionSamples");
422
423 // Tie breaker using number of samples try to favor smaller functions first
424 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
425 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
426
427 // Tie breaker using GUID so we have stable/deterministic inlining order
428 return LCS->getGUID() < RCS->getGUID();
429 }
430};
431
432using CandidateQueue =
434 CandidateComparer>;
435
436// Sample profile matching - fuzzy match.
437class SampleProfileMatcher {
438 Module &M;
439 SampleProfileReader &Reader;
440 const PseudoProbeManager *ProbeManager;
441 SampleProfileMap FlattenedProfiles;
442 // For each function, the matcher generates a map, of which each entry is a
443 // mapping from the source location of current build to the source location in
444 // the profile.
445 StringMap<LocToLocMap> FuncMappings;
446
447 // Profile mismatching statstics.
448 uint64_t TotalProfiledCallsites = 0;
449 uint64_t NumMismatchedCallsites = 0;
450 uint64_t MismatchedCallsiteSamples = 0;
451 uint64_t TotalCallsiteSamples = 0;
452 uint64_t TotalProfiledFunc = 0;
453 uint64_t NumMismatchedFuncHash = 0;
454 uint64_t MismatchedFuncHashSamples = 0;
455 uint64_t TotalFuncHashSamples = 0;
456
457 // A dummy name for unknown indirect callee, used to differentiate from a
458 // non-call instruction that also has an empty callee name.
459 static constexpr const char *UnknownIndirectCallee =
460 "unknown.indirect.callee";
461
462public:
463 SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
464 const PseudoProbeManager *ProbeManager)
465 : M(M), Reader(Reader), ProbeManager(ProbeManager){};
466 void runOnModule();
467
468private:
469 FunctionSamples *getFlattenedSamplesFor(const Function &F) {
471 auto It = FlattenedProfiles.find(FunctionId(CanonFName));
472 if (It != FlattenedProfiles.end())
473 return &It->second;
474 return nullptr;
475 }
476 void runOnFunction(const Function &F);
477 void findIRAnchors(const Function &F,
478 std::map<LineLocation, StringRef> &IRAnchors);
479 void findProfileAnchors(
480 const FunctionSamples &FS,
481 std::map<LineLocation, std::unordered_set<FunctionId>>
482 &ProfileAnchors);
483 void countMismatchedSamples(const FunctionSamples &FS);
484 void countProfileMismatches(
485 const Function &F, const FunctionSamples &FS,
486 const std::map<LineLocation, StringRef> &IRAnchors,
487 const std::map<LineLocation, std::unordered_set<FunctionId>>
488 &ProfileAnchors);
489 void countProfileCallsiteMismatches(
490 const FunctionSamples &FS,
491 const std::map<LineLocation, StringRef> &IRAnchors,
492 const std::map<LineLocation, std::unordered_set<FunctionId>>
493 &ProfileAnchors,
494 uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites);
495 LocToLocMap &getIRToProfileLocationMap(const Function &F) {
496 auto Ret = FuncMappings.try_emplace(
498 return Ret.first->second;
499 }
500 void distributeIRToProfileLocationMap();
501 void distributeIRToProfileLocationMap(FunctionSamples &FS);
502 void runStaleProfileMatching(
503 const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
504 const std::map<LineLocation, std::unordered_set<FunctionId>>
505 &ProfileAnchors,
506 LocToLocMap &IRToProfileLocationMap);
507};
508
509/// Sample profile pass.
510///
511/// This pass reads profile data from the file specified by
512/// -sample-profile-file and annotates every affected function with the
513/// profile information found in that file.
514class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> {
515public:
516 SampleProfileLoader(
517 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
519 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
520 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
521 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
523 std::move(FS)),
524 GetAC(std::move(GetAssumptionCache)),
525 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
526 LTOPhase(LTOPhase),
527 AnnotatedPassName(AnnotateSampleProfileInlinePhase
530 : CSINLINE_DEBUG) {}
531
532 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
533 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
535
536protected:
538 bool emitAnnotations(Function &F);
540 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
541 const FunctionSamples *
542 findFunctionSamples(const Instruction &I) const override;
543 std::vector<const FunctionSamples *>
544 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
545 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
546 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
547 uint64_t Threshold);
548 // Attempt to promote indirect call and also inline the promoted call
549 bool tryPromoteAndInlineCandidate(
550 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
551 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
552
553 bool inlineHotFunctions(Function &F,
554 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
555 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
556 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
557 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
558 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
559 bool
560 tryInlineCandidate(InlineCandidate &Candidate,
561 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
562 bool
563 inlineHotFunctionsWithPriority(Function &F,
564 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
565 // Inline cold/small functions in addition to hot ones
566 bool shouldInlineColdCallee(CallBase &CallInst);
567 void emitOptimizationRemarksForInlineCandidates(
568 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
569 bool Hot);
570 void promoteMergeNotInlinedContextSamples(
572 const Function &F);
573 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
574 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
575 void generateMDProfMetadata(Function &F);
576
577 /// Map from function name to Function *. Used to find the function from
578 /// the function name. If the function name contains suffix, additional
579 /// entry is added to map from the stripped name to the function if there
580 /// is one-to-one mapping.
582
583 std::function<AssumptionCache &(Function &)> GetAC;
584 std::function<TargetTransformInfo &(Function &)> GetTTI;
585 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
586
587 /// Profile tracker for different context.
588 std::unique_ptr<SampleContextTracker> ContextTracker;
589
590 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
591 ///
592 /// We need to know the LTO phase because for example in ThinLTOPrelink
593 /// phase, in annotation, we should not promote indirect calls. Instead,
594 /// we will mark GUIDs that needs to be annotated to the function.
595 const ThinOrFullLTOPhase LTOPhase;
596 const std::string AnnotatedPassName;
597
598 /// Profle Symbol list tells whether a function name appears in the binary
599 /// used to generate the current profile.
600 std::unique_ptr<ProfileSymbolList> PSL;
601
602 /// Total number of samples collected in this profile.
603 ///
604 /// This is the sum of all the samples collected in all the functions executed
605 /// at runtime.
606 uint64_t TotalCollectedSamples = 0;
607
608 // Information recorded when we declined to inline a call site
609 // because we have determined it is too cold is accumulated for
610 // each callee function. Initially this is just the entry count.
611 struct NotInlinedProfileInfo {
612 uint64_t entryCount;
613 };
615
616 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
617 // all the function symbols defined or declared in current module.
618 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
619
620 // All the Names used in FunctionSamples including outline function
621 // names, inline instance names and call target names.
622 StringSet<> NamesInProfile;
623 // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is
624 // populated, depends on whether the profile uses MD5. Because the name table
625 // generally contains several magnitude more entries than the number of
626 // functions, we do not want to convert all names from one form to another.
627 llvm::DenseSet<uint64_t> GUIDsInProfile;
628
629 // For symbol in profile symbol list, whether to regard their profiles
630 // to be accurate. It is mainly decided by existance of profile symbol
631 // list and -profile-accurate-for-symsinlist flag, but it can be
632 // overriden by -profile-sample-accurate or profile-sample-accurate
633 // attribute.
634 bool ProfAccForSymsInList;
635
636 // External inline advisor used to replay inline decision from remarks.
637 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
638
639 // A helper to implement the sample profile matching algorithm.
640 std::unique_ptr<SampleProfileMatcher> MatchingManager;
641
642private:
643 const char *getAnnotatedRemarkPassName() const {
644 return AnnotatedPassName.c_str();
645 }
646};
647} // end anonymous namespace
648
649namespace llvm {
650template <>
652 return succ_empty(BB);
653}
654
655template <>
657 const std::vector<const BasicBlockT *> &BasicBlocks,
658 BlockEdgeMap &Successors, FlowFunction &Func) {
659 for (auto &Jump : Func.Jumps) {
660 const auto *BB = BasicBlocks[Jump.Source];
661 const auto *Succ = BasicBlocks[Jump.Target];
662 const Instruction *TI = BB->getTerminator();
663 // Check if a block ends with InvokeInst and mark non-taken branch unlikely.
664 // In that case block Succ should be a landing pad
665 if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
666 if (isa<InvokeInst>(TI)) {
667 Jump.IsUnlikely = true;
668 }
669 }
670 const Instruction *SuccTI = Succ->getTerminator();
671 // Check if the target block contains UnreachableInst and mark it unlikely
672 if (SuccTI->getNumSuccessors() == 0) {
673 if (isa<UnreachableInst>(SuccTI)) {
674 Jump.IsUnlikely = true;
675 }
676 }
677 }
678}
679
680template <>
682 Function &F) {
683 DT.reset(new DominatorTree);
684 DT->recalculate(F);
685
686 PDT.reset(new PostDominatorTree(F));
687
688 LI.reset(new LoopInfo);
689 LI->analyze(*DT);
690}
691} // namespace llvm
692
693ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
695 return getProbeWeight(Inst);
696
697 const DebugLoc &DLoc = Inst.getDebugLoc();
698 if (!DLoc)
699 return std::error_code();
700
701 // Ignore all intrinsics, phinodes and branch instructions.
702 // Branch and phinodes instruction usually contains debug info from sources
703 // outside of the residing basic block, thus we ignore them during annotation.
704 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
705 return std::error_code();
706
707 // For non-CS profile, if a direct call/invoke instruction is inlined in
708 // profile (findCalleeFunctionSamples returns non-empty result), but not
709 // inlined here, it means that the inlined callsite has no sample, thus the
710 // call instruction should have 0 count.
711 // For CS profile, the callsite count of previously inlined callees is
712 // populated with the entry count of the callees.
714 if (const auto *CB = dyn_cast<CallBase>(&Inst))
715 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
716 return 0;
717
718 return getInstWeightImpl(Inst);
719}
720
721/// Get the FunctionSamples for a call instruction.
722///
723/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
724/// instance in which that call instruction is calling to. It contains
725/// all samples that resides in the inlined instance. We first find the
726/// inlined instance in which the call instruction is from, then we
727/// traverse its children to find the callsite with the matching
728/// location.
729///
730/// \param Inst Call/Invoke instruction to query.
731///
732/// \returns The FunctionSamples pointer to the inlined instance.
733const FunctionSamples *
734SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
735 const DILocation *DIL = Inst.getDebugLoc();
736 if (!DIL) {
737 return nullptr;
738 }
739
740 StringRef CalleeName;
741 if (Function *Callee = Inst.getCalledFunction())
742 CalleeName = Callee->getName();
743
745 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
746
747 const FunctionSamples *FS = findFunctionSamples(Inst);
748 if (FS == nullptr)
749 return nullptr;
750
751 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
752 CalleeName, Reader->getRemapper());
753}
754
755/// Returns a vector of FunctionSamples that are the indirect call targets
756/// of \p Inst. The vector is sorted by the total number of samples. Stores
757/// the total call count of the indirect call in \p Sum.
758std::vector<const FunctionSamples *>
759SampleProfileLoader::findIndirectCallFunctionSamples(
760 const Instruction &Inst, uint64_t &Sum) const {
761 const DILocation *DIL = Inst.getDebugLoc();
762 std::vector<const FunctionSamples *> R;
763
764 if (!DIL) {
765 return R;
766 }
767
768 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
769 assert(L && R && "Expect non-null FunctionSamples");
770 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
771 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
772 return L->getGUID() < R->getGUID();
773 };
774
776 auto CalleeSamples =
777 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
778 if (CalleeSamples.empty())
779 return R;
780
781 // For CSSPGO, we only use target context profile's entry count
782 // as that already includes both inlined callee and non-inlined ones..
783 Sum = 0;
784 for (const auto *const FS : CalleeSamples) {
785 Sum += FS->getHeadSamplesEstimate();
786 R.push_back(FS);
787 }
788 llvm::sort(R, FSCompare);
789 return R;
790 }
791
792 const FunctionSamples *FS = findFunctionSamples(Inst);
793 if (FS == nullptr)
794 return R;
795
797 auto T = FS->findCallTargetMapAt(CallSite);
798 Sum = 0;
799 if (T)
800 for (const auto &T_C : T.get())
801 Sum += T_C.second;
802 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
803 if (M->empty())
804 return R;
805 for (const auto &NameFS : *M) {
806 Sum += NameFS.second.getHeadSamplesEstimate();
807 R.push_back(&NameFS.second);
808 }
809 llvm::sort(R, FSCompare);
810 }
811 return R;
812}
813
814const FunctionSamples *
815SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
817 std::optional<PseudoProbe> Probe = extractProbe(Inst);
818 if (!Probe)
819 return nullptr;
820 }
821
822 const DILocation *DIL = Inst.getDebugLoc();
823 if (!DIL)
824 return Samples;
825
826 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
827 if (it.second) {
829 it.first->second = ContextTracker->getContextSamplesFor(DIL);
830 else
831 it.first->second =
832 Samples->findFunctionSamples(DIL, Reader->getRemapper());
833 }
834 return it.first->second;
835}
836
837/// Check whether the indirect call promotion history of \p Inst allows
838/// the promotion for \p Candidate.
839/// If the profile count for the promotion candidate \p Candidate is
840/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
841/// for \p Inst. If we already have at least MaxNumPromotions
842/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
843/// cannot promote for \p Inst anymore.
844static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
845 uint32_t NumVals = 0;
846 uint64_t TotalCount = 0;
847 std::unique_ptr<InstrProfValueData[]> ValueData =
848 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
849 bool Valid =
850 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
851 ValueData.get(), NumVals, TotalCount, true);
852 // No valid value profile so no promoted targets have been recorded
853 // before. Ok to do ICP.
854 if (!Valid)
855 return true;
856
857 unsigned NumPromoted = 0;
858 for (uint32_t I = 0; I < NumVals; I++) {
859 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
860 continue;
861
862 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
863 // metadata, it means the candidate has been promoted for this
864 // indirect call.
865 if (ValueData[I].Value == Function::getGUID(Candidate))
866 return false;
867 NumPromoted++;
868 // If already have MaxNumPromotions promotion, don't do it anymore.
869 if (NumPromoted == MaxNumPromotions)
870 return false;
871 }
872 return true;
873}
874
875/// Update indirect call target profile metadata for \p Inst.
876/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
877/// If it is 0, it means updateIDTMetaData is used to mark a
878/// certain target to be promoted already. If it is not zero,
879/// we expect to use it to update the total count in the value profile.
880static void
882 const SmallVectorImpl<InstrProfValueData> &CallTargets,
883 uint64_t Sum) {
884 // Bail out early if MaxNumPromotions is zero.
885 // This prevents allocating an array of zero length below.
886 //
887 // Note `updateIDTMetaData` is called in two places so check
888 // `MaxNumPromotions` inside it.
889 if (MaxNumPromotions == 0)
890 return;
891 uint32_t NumVals = 0;
892 // OldSum is the existing total count in the value profile data.
893 uint64_t OldSum = 0;
894 std::unique_ptr<InstrProfValueData[]> ValueData =
895 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
896 bool Valid =
897 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
898 ValueData.get(), NumVals, OldSum, true);
899
900 DenseMap<uint64_t, uint64_t> ValueCountMap;
901 if (Sum == 0) {
902 assert((CallTargets.size() == 1 &&
903 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
904 "If sum is 0, assume only one element in CallTargets "
905 "with count being NOMORE_ICP_MAGICNUM");
906 // Initialize ValueCountMap with existing value profile data.
907 if (Valid) {
908 for (uint32_t I = 0; I < NumVals; I++)
909 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
910 }
911 auto Pair =
912 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
913 // If the target already exists in value profile, decrease the total
914 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
915 if (!Pair.second) {
916 OldSum -= Pair.first->second;
917 Pair.first->second = NOMORE_ICP_MAGICNUM;
918 }
919 Sum = OldSum;
920 } else {
921 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
922 // counts in the value profile.
923 if (Valid) {
924 for (uint32_t I = 0; I < NumVals; I++) {
925 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
926 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
927 }
928 }
929
930 for (const auto &Data : CallTargets) {
931 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
932 if (Pair.second)
933 continue;
934 // The target represented by Data.Value has already been promoted.
935 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
936 // Sum by Data.Count.
937 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
938 Sum -= Data.Count;
939 }
940 }
941
943 for (const auto &ValueCount : ValueCountMap) {
944 NewCallTargets.emplace_back(
945 InstrProfValueData{ValueCount.first, ValueCount.second});
946 }
947
948 llvm::sort(NewCallTargets,
949 [](const InstrProfValueData &L, const InstrProfValueData &R) {
950 if (L.Count != R.Count)
951 return L.Count > R.Count;
952 return L.Value > R.Value;
953 });
954
955 uint32_t MaxMDCount =
956 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
958 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
959}
960
961/// Attempt to promote indirect call and also inline the promoted call.
962///
963/// \param F Caller function.
964/// \param Candidate ICP and inline candidate.
965/// \param SumOrigin Original sum of target counts for indirect call before
966/// promoting given candidate.
967/// \param Sum Prorated sum of remaining target counts for indirect call
968/// after promoting given candidate.
969/// \param InlinedCallSite Output vector for new call sites exposed after
970/// inlining.
971bool SampleProfileLoader::tryPromoteAndInlineCandidate(
972 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
973 SmallVector<CallBase *, 8> *InlinedCallSite) {
974 // Bail out early if sample-loader inliner is disabled.
976 return false;
977
978 // Bail out early if MaxNumPromotions is zero.
979 // This prevents allocating an array of zero length in callees below.
980 if (MaxNumPromotions == 0)
981 return false;
982 auto CalleeFunctionName = Candidate.CalleeSamples->getFunction();
983 auto R = SymbolMap.find(CalleeFunctionName);
984 if (R == SymbolMap.end() || !R->second)
985 return false;
986
987 auto &CI = *Candidate.CallInstr;
988 if (!doesHistoryAllowICP(CI, R->second->getName()))
989 return false;
990
991 const char *Reason = "Callee function not available";
992 // R->getValue() != &F is to prevent promoting a recursive call.
993 // If it is a recursive call, we do not inline it as it could bloat
994 // the code exponentially. There is way to better handle this, e.g.
995 // clone the caller first, and inline the cloned caller if it is
996 // recursive. As llvm does not inline recursive calls, we will
997 // simply ignore it instead of handling it explicitly.
998 if (!R->second->isDeclaration() && R->second->getSubprogram() &&
999 R->second->hasFnAttribute("use-sample-profile") &&
1000 R->second != &F && isLegalToPromote(CI, R->second, &Reason)) {
1001 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
1002 // in the value profile metadata so the target won't be promoted again.
1003 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
1004 Function::getGUID(R->second->getName()), NOMORE_ICP_MAGICNUM}};
1005 updateIDTMetaData(CI, SortedCallTargets, 0);
1006
1007 auto *DI = &pgo::promoteIndirectCall(
1008 CI, R->second, Candidate.CallsiteCount, Sum, false, ORE);
1009 if (DI) {
1010 Sum -= Candidate.CallsiteCount;
1011 // Do not prorate the indirect callsite distribution since the original
1012 // distribution will be used to scale down non-promoted profile target
1013 // counts later. By doing this we lose track of the real callsite count
1014 // for the leftover indirect callsite as a trade off for accurate call
1015 // target counts.
1016 // TODO: Ideally we would have two separate factors, one for call site
1017 // counts and one is used to prorate call target counts.
1018 // Do not update the promoted direct callsite distribution at this
1019 // point since the original distribution combined with the callee profile
1020 // will be used to prorate callsites from the callee if inlined. Once not
1021 // inlined, the direct callsite distribution should be prorated so that
1022 // the it will reflect the real callsite counts.
1023 Candidate.CallInstr = DI;
1024 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
1025 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
1026 if (!Inlined) {
1027 // Prorate the direct callsite distribution so that it reflects real
1028 // callsite counts.
1030 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
1031 }
1032 return Inlined;
1033 }
1034 }
1035 } else {
1036 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
1038 Candidate.CallInstr->getName())<< " because "
1039 << Reason << "\n");
1040 }
1041 return false;
1042}
1043
1044bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1045 if (!ProfileSizeInline)
1046 return false;
1047
1049 if (Callee == nullptr)
1050 return false;
1051
1053 GetAC, GetTLI);
1054
1055 if (Cost.isNever())
1056 return false;
1057
1058 if (Cost.isAlways())
1059 return true;
1060
1061 return Cost.getCost() <= SampleColdCallSiteThreshold;
1062}
1063
1064void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1065 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1066 bool Hot) {
1067 for (auto *I : Candidates) {
1068 Function *CalledFunction = I->getCalledFunction();
1069 if (CalledFunction) {
1070 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1071 "InlineAttempt", I->getDebugLoc(),
1072 I->getParent())
1073 << "previous inlining reattempted for "
1074 << (Hot ? "hotness: '" : "size: '")
1075 << ore::NV("Callee", CalledFunction) << "' into '"
1076 << ore::NV("Caller", &F) << "'");
1077 }
1078 }
1079}
1080
1081void SampleProfileLoader::findExternalInlineCandidate(
1082 CallBase *CB, const FunctionSamples *Samples,
1083 DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) {
1084
1085 // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external
1086 // function make sure it's imported
1087 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1088 // Samples may not exist for replayed function, if so
1089 // just add the direct GUID and move on
1090 if (!Samples) {
1091 InlinedGUIDs.insert(
1092 Function::getGUID(CB->getCalledFunction()->getName()));
1093 return;
1094 }
1095 // Otherwise, drop the threshold to import everything that we can
1096 Threshold = 0;
1097 }
1098
1099 // In some rare cases, call instruction could be changed after being pushed
1100 // into inline candidate queue, this is because earlier inlining may expose
1101 // constant propagation which can change indirect call to direct call. When
1102 // this happens, we may fail to find matching function samples for the
1103 // candidate later, even if a match was found when the candidate was enqueued.
1104 if (!Samples)
1105 return;
1106
1107 // For AutoFDO profile, retrieve candidate profiles by walking over
1108 // the nested inlinee profiles.
1110 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1111 return;
1112 }
1113
1114 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1115 std::queue<ContextTrieNode *> CalleeList;
1116 CalleeList.push(Caller);
1117 while (!CalleeList.empty()) {
1118 ContextTrieNode *Node = CalleeList.front();
1119 CalleeList.pop();
1120 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1121 // For CSSPGO profile, retrieve candidate profile by walking over the
1122 // trie built for context profile. Note that also take call targets
1123 // even if callee doesn't have a corresponding context profile.
1124 if (!CalleeSample)
1125 continue;
1126
1127 // If pre-inliner decision is used, honor that for importing as well.
1128 bool PreInline =
1131 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1132 continue;
1133
1134 Function *Func = SymbolMap.lookup(CalleeSample->getFunction());
1135 // Add to the import list only when it's defined out of module.
1136 if (!Func || Func->isDeclaration())
1137 InlinedGUIDs.insert(CalleeSample->getGUID());
1138
1139 // Import hot CallTargets, which may not be available in IR because full
1140 // profile annotation cannot be done until backend compilation in ThinLTO.
1141 for (const auto &BS : CalleeSample->getBodySamples())
1142 for (const auto &TS : BS.second.getCallTargets())
1143 if (TS.second > Threshold) {
1144 const Function *Callee = SymbolMap.lookup(TS.first);
1145 if (!Callee || Callee->isDeclaration())
1146 InlinedGUIDs.insert(TS.first.getHashCode());
1147 }
1148
1149 // Import hot child context profile associted with callees. Note that this
1150 // may have some overlap with the call target loop above, but doing this
1151 // based child context profile again effectively allow us to use the max of
1152 // entry count and call target count to determine importing.
1153 for (auto &Child : Node->getAllChildContext()) {
1154 ContextTrieNode *CalleeNode = &Child.second;
1155 CalleeList.push(CalleeNode);
1156 }
1157 }
1158}
1159
1160/// Iteratively inline hot callsites of a function.
1161///
1162/// Iteratively traverse all callsites of the function \p F, so as to
1163/// find out callsites with corresponding inline instances.
1164///
1165/// For such callsites,
1166/// - If it is hot enough, inline the callsites and adds callsites of the callee
1167/// into the caller. If the call is an indirect call, first promote
1168/// it to direct call. Each indirect call is limited with a single target.
1169///
1170/// - If a callsite is not inlined, merge the its profile to the outline
1171/// version (if --sample-profile-merge-inlinee is true), or scale the
1172/// counters of standalone function based on the profile of inlined
1173/// instances (if --sample-profile-merge-inlinee is false).
1174///
1175/// Later passes may consume the updated profiles.
1176///
1177/// \param F function to perform iterative inlining.
1178/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1179/// inlined in the profiled binary.
1180///
1181/// \returns True if there is any inline happened.
1182bool SampleProfileLoader::inlineHotFunctions(
1183 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1184 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1185 // Profile symbol list is ignored when profile-sample-accurate is on.
1186 assert((!ProfAccForSymsInList ||
1188 !F.hasFnAttribute("profile-sample-accurate"))) &&
1189 "ProfAccForSymsInList should be false when profile-sample-accurate "
1190 "is enabled");
1191
1192 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1193 bool Changed = false;
1194 bool LocalChanged = true;
1195 while (LocalChanged) {
1196 LocalChanged = false;
1198 for (auto &BB : F) {
1199 bool Hot = false;
1200 SmallVector<CallBase *, 10> AllCandidates;
1201 SmallVector<CallBase *, 10> ColdCandidates;
1202 for (auto &I : BB) {
1203 const FunctionSamples *FS = nullptr;
1204 if (auto *CB = dyn_cast<CallBase>(&I)) {
1205 if (!isa<IntrinsicInst>(I)) {
1206 if ((FS = findCalleeFunctionSamples(*CB))) {
1207 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1208 "GUIDToFuncNameMap has to be populated");
1209 AllCandidates.push_back(CB);
1210 if (FS->getHeadSamplesEstimate() > 0 ||
1212 LocalNotInlinedCallSites.insert({CB, FS});
1213 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1214 Hot = true;
1215 else if (shouldInlineColdCallee(*CB))
1216 ColdCandidates.push_back(CB);
1217 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1218 AllCandidates.push_back(CB);
1219 }
1220 }
1221 }
1222 }
1223 if (Hot || ExternalInlineAdvisor) {
1224 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1225 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1226 } else {
1227 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1228 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1229 }
1230 }
1231 for (CallBase *I : CIS) {
1232 Function *CalledFunction = I->getCalledFunction();
1233 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1234 0 /* dummy count */,
1235 1.0 /* dummy distribution factor */};
1236 // Do not inline recursive calls.
1237 if (CalledFunction == &F)
1238 continue;
1239 if (I->isIndirectCall()) {
1240 uint64_t Sum;
1241 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1242 uint64_t SumOrigin = Sum;
1243 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1244 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1245 PSI->getOrCompHotCountThreshold());
1246 continue;
1247 }
1248 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1249 continue;
1250
1251 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1252 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1253 LocalNotInlinedCallSites.erase(I);
1254 LocalChanged = true;
1255 }
1256 }
1257 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1258 !CalledFunction->isDeclaration()) {
1259 if (tryInlineCandidate(Candidate)) {
1260 LocalNotInlinedCallSites.erase(I);
1261 LocalChanged = true;
1262 }
1263 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1264 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1265 InlinedGUIDs,
1266 PSI->getOrCompHotCountThreshold());
1267 }
1268 }
1269 Changed |= LocalChanged;
1270 }
1271
1272 // For CS profile, profile for not inlined context will be merged when
1273 // base profile is being retrieved.
1275 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1276 return Changed;
1277}
1278
1279bool SampleProfileLoader::tryInlineCandidate(
1280 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1281 // Do not attempt to inline a candidate if
1282 // --disable-sample-loader-inlining is true.
1284 return false;
1285
1286 CallBase &CB = *Candidate.CallInstr;
1287 Function *CalledFunction = CB.getCalledFunction();
1288 assert(CalledFunction && "Expect a callee with definition");
1289 DebugLoc DLoc = CB.getDebugLoc();
1290 BasicBlock *BB = CB.getParent();
1291
1292 InlineCost Cost = shouldInlineCandidate(Candidate);
1293 if (Cost.isNever()) {
1294 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1295 "InlineFail", DLoc, BB)
1296 << "incompatible inlining");
1297 return false;
1298 }
1299
1300 if (!Cost)
1301 return false;
1302
1303 InlineFunctionInfo IFI(GetAC);
1304 IFI.UpdateProfile = false;
1305 InlineResult IR = InlineFunction(CB, IFI,
1306 /*MergeAttributes=*/true);
1307 if (!IR.isSuccess())
1308 return false;
1309
1310 // The call to InlineFunction erases I, so we can't pass it here.
1311 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1312 Cost, true, getAnnotatedRemarkPassName());
1313
1314 // Now populate the list of newly exposed call sites.
1315 if (InlinedCallSites) {
1316 InlinedCallSites->clear();
1317 for (auto &I : IFI.InlinedCallSites)
1318 InlinedCallSites->push_back(I);
1319 }
1320
1322 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1323 ++NumCSInlined;
1324
1325 // Prorate inlined probes for a duplicated inlining callsite which probably
1326 // has a distribution less than 100%. Samples for an inlinee should be
1327 // distributed among the copies of the original callsite based on each
1328 // callsite's distribution factor for counts accuracy. Note that an inlined
1329 // probe may come with its own distribution factor if it has been duplicated
1330 // in the inlinee body. The two factor are multiplied to reflect the
1331 // aggregation of duplication.
1332 if (Candidate.CallsiteDistribution < 1) {
1333 for (auto &I : IFI.InlinedCallSites) {
1334 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1335 setProbeDistributionFactor(*I, Probe->Factor *
1336 Candidate.CallsiteDistribution);
1337 }
1338 NumDuplicatedInlinesite++;
1339 }
1340
1341 return true;
1342}
1343
1344bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1345 CallBase *CB) {
1346 assert(CB && "Expect non-null call instruction");
1347
1348 if (isa<IntrinsicInst>(CB))
1349 return false;
1350
1351 // Find the callee's profile. For indirect call, find hottest target profile.
1352 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1353 // If ExternalInlineAdvisor wants to inline this site, do so even
1354 // if Samples are not present.
1355 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1356 return false;
1357
1358 float Factor = 1.0;
1359 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1360 Factor = Probe->Factor;
1361
1362 uint64_t CallsiteCount =
1363 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1364 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1365 return true;
1366}
1367
1368std::optional<InlineCost>
1369SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1370 std::unique_ptr<InlineAdvice> Advice = nullptr;
1371 if (ExternalInlineAdvisor) {
1372 Advice = ExternalInlineAdvisor->getAdvice(CB);
1373 if (Advice) {
1374 if (!Advice->isInliningRecommended()) {
1375 Advice->recordUnattemptedInlining();
1376 return InlineCost::getNever("not previously inlined");
1377 }
1378 Advice->recordInlining();
1379 return InlineCost::getAlways("previously inlined");
1380 }
1381 }
1382
1383 return {};
1384}
1385
1386bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1387 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1388 return Cost ? !!*Cost : false;
1389}
1390
1392SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1393 if (std::optional<InlineCost> ReplayCost =
1394 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1395 return *ReplayCost;
1396 // Adjust threshold based on call site hotness, only do this for callsite
1397 // prioritized inliner because otherwise cost-benefit check is done earlier.
1398 int SampleThreshold = SampleColdCallSiteThreshold;
1400 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1401 SampleThreshold = SampleHotCallSiteThreshold;
1402 else if (!ProfileSizeInline)
1403 return InlineCost::getNever("cold callsite");
1404 }
1405
1406 Function *Callee = Candidate.CallInstr->getCalledFunction();
1407 assert(Callee && "Expect a definition for inline candidate of direct call");
1408
1409 InlineParams Params = getInlineParams();
1410 // We will ignore the threshold from inline cost, so always get full cost.
1411 Params.ComputeFullInlineCost = true;
1413 // Checks if there is anything in the reachable portion of the callee at
1414 // this callsite that makes this inlining potentially illegal. Need to
1415 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1416 // when cost exceeds threshold without checking all IRs in the callee.
1417 // The acutal cost does not matter because we only checks isNever() to
1418 // see if it is legal to inline the callsite.
1419 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1420 GetTTI(*Callee), GetAC, GetTLI);
1421
1422 // Honor always inline and never inline from call analyzer
1423 if (Cost.isNever() || Cost.isAlways())
1424 return Cost;
1425
1426 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1427 // decisions based on hotness as well as accurate function byte sizes for
1428 // given context using function/inlinee sizes from previous build. It
1429 // stores the decision in profile, and also adjust/merge context profile
1430 // aiming at better context-sensitive post-inline profile quality, assuming
1431 // all inline decision estimates are going to be honored by compiler. Here
1432 // we replay that inline decision under `sample-profile-use-preinliner`.
1433 // Note that we don't need to handle negative decision from preinliner as
1434 // context profile for not inlined calls are merged by preinliner already.
1435 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1436 // Once two node are merged due to promotion, we're losing some context
1437 // so the original context-sensitive preinliner decision should be ignored
1438 // for SyntheticContext.
1439 SampleContext &Context = Candidate.CalleeSamples->getContext();
1440 if (!Context.hasState(SyntheticContext) &&
1441 Context.hasAttribute(ContextShouldBeInlined))
1442 return InlineCost::getAlways("preinliner");
1443 }
1444
1445 // For old FDO inliner, we inline the call site as long as cost is not
1446 // "Never". The cost-benefit check is done earlier.
1448 return InlineCost::get(Cost.getCost(), INT_MAX);
1449 }
1450
1451 // Otherwise only use the cost from call analyzer, but overwite threshold with
1452 // Sample PGO threshold.
1453 return InlineCost::get(Cost.getCost(), SampleThreshold);
1454}
1455
1456bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1457 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1458 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1459 // Profile symbol list is ignored when profile-sample-accurate is on.
1460 assert((!ProfAccForSymsInList ||
1462 !F.hasFnAttribute("profile-sample-accurate"))) &&
1463 "ProfAccForSymsInList should be false when profile-sample-accurate "
1464 "is enabled");
1465
1466 // Populating worklist with initial call sites from root inliner, along
1467 // with call site weights.
1468 CandidateQueue CQueue;
1469 InlineCandidate NewCandidate;
1470 for (auto &BB : F) {
1471 for (auto &I : BB) {
1472 auto *CB = dyn_cast<CallBase>(&I);
1473 if (!CB)
1474 continue;
1475 if (getInlineCandidate(&NewCandidate, CB))
1476 CQueue.push(NewCandidate);
1477 }
1478 }
1479
1480 // Cap the size growth from profile guided inlining. This is needed even
1481 // though cost of each inline candidate already accounts for callee size,
1482 // because with top-down inlining, we can grow inliner size significantly
1483 // with large number of smaller inlinees each pass the cost check.
1485 "Max inline size limit should not be smaller than min inline size "
1486 "limit.");
1487 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1488 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1489 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1490 if (ExternalInlineAdvisor)
1491 SizeLimit = std::numeric_limits<unsigned>::max();
1492
1493 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1494
1495 // Perform iterative BFS call site prioritized inlining
1496 bool Changed = false;
1497 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1498 InlineCandidate Candidate = CQueue.top();
1499 CQueue.pop();
1500 CallBase *I = Candidate.CallInstr;
1501 Function *CalledFunction = I->getCalledFunction();
1502
1503 if (CalledFunction == &F)
1504 continue;
1505 if (I->isIndirectCall()) {
1506 uint64_t Sum = 0;
1507 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
1508 uint64_t SumOrigin = Sum;
1509 Sum *= Candidate.CallsiteDistribution;
1510 unsigned ICPCount = 0;
1511 for (const auto *FS : CalleeSamples) {
1512 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1513 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1514 findExternalInlineCandidate(I, FS, InlinedGUIDs,
1515 PSI->getOrCompHotCountThreshold());
1516 continue;
1517 }
1518 uint64_t EntryCountDistributed =
1519 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1520 // In addition to regular inline cost check, we also need to make sure
1521 // ICP isn't introducing excessive speculative checks even if individual
1522 // target looks beneficial to promote and inline. That means we should
1523 // only do ICP when there's a small number dominant targets.
1524 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1525 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1526 break;
1527 // TODO: Fix CallAnalyzer to handle all indirect calls.
1528 // For indirect call, we don't run CallAnalyzer to get InlineCost
1529 // before actual inlining. This is because we could see two different
1530 // types from the same definition, which makes CallAnalyzer choke as
1531 // it's expecting matching parameter type on both caller and callee
1532 // side. See example from PR18962 for the triggering cases (the bug was
1533 // fixed, but we generate different types).
1534 if (!PSI->isHotCount(EntryCountDistributed))
1535 break;
1536 SmallVector<CallBase *, 8> InlinedCallSites;
1537 // Attach function profile for promoted indirect callee, and update
1538 // call site count for the promoted inline candidate too.
1539 Candidate = {I, FS, EntryCountDistributed,
1540 Candidate.CallsiteDistribution};
1541 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1542 &InlinedCallSites)) {
1543 for (auto *CB : InlinedCallSites) {
1544 if (getInlineCandidate(&NewCandidate, CB))
1545 CQueue.emplace(NewCandidate);
1546 }
1547 ICPCount++;
1548 Changed = true;
1549 } else if (!ContextTracker) {
1550 LocalNotInlinedCallSites.insert({I, FS});
1551 }
1552 }
1553 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1554 !CalledFunction->isDeclaration()) {
1555 SmallVector<CallBase *, 8> InlinedCallSites;
1556 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1557 for (auto *CB : InlinedCallSites) {
1558 if (getInlineCandidate(&NewCandidate, CB))
1559 CQueue.emplace(NewCandidate);
1560 }
1561 Changed = true;
1562 } else if (!ContextTracker) {
1563 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1564 }
1565 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1566 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1567 InlinedGUIDs,
1568 PSI->getOrCompHotCountThreshold());
1569 }
1570 }
1571
1572 if (!CQueue.empty()) {
1573 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1574 ++NumCSInlinedHitMaxLimit;
1575 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1576 ++NumCSInlinedHitMinLimit;
1577 else
1578 ++NumCSInlinedHitGrowthLimit;
1579 }
1580
1581 // For CS profile, profile for not inlined context will be merged when
1582 // base profile is being retrieved.
1584 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1585 return Changed;
1586}
1587
1588void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1590 const Function &F) {
1591 // Accumulate not inlined callsite information into notInlinedSamples
1592 for (const auto &Pair : NonInlinedCallSites) {
1593 CallBase *I = Pair.first;
1594 Function *Callee = I->getCalledFunction();
1595 if (!Callee || Callee->isDeclaration())
1596 continue;
1597
1598 ORE->emit(
1599 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1600 I->getDebugLoc(), I->getParent())
1601 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1602 << "' into '" << ore::NV("Caller", &F) << "'");
1603
1604 ++NumCSNotInlined;
1605 const FunctionSamples *FS = Pair.second;
1606 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1607 continue;
1608 }
1609
1610 // Do not merge a context that is already duplicated into the base profile.
1611 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1612 continue;
1613
1614 if (ProfileMergeInlinee) {
1615 // A function call can be replicated by optimizations like callsite
1616 // splitting or jump threading and the replicates end up sharing the
1617 // sample nested callee profile instead of slicing the original
1618 // inlinee's profile. We want to do merge exactly once by filtering out
1619 // callee profiles with a non-zero head sample count.
1620 if (FS->getHeadSamples() == 0) {
1621 // Use entry samples as head samples during the merge, as inlinees
1622 // don't have head samples.
1623 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1624 FS->getHeadSamplesEstimate());
1625
1626 // Note that we have to do the merge right after processing function.
1627 // This allows OutlineFS's profile to be used for annotation during
1628 // top-down processing of functions' annotation.
1629 FunctionSamples *OutlineFS = Reader->getSamplesFor(*Callee);
1630 // If outlined function does not exist in the profile, add it to a
1631 // separate map so that it does not rehash the original profile.
1632 if (!OutlineFS)
1633 OutlineFS = &OutlineFunctionSamples[
1635 OutlineFS->merge(*FS, 1);
1636 // Set outlined profile to be synthetic to not bias the inliner.
1637 OutlineFS->SetContextSynthetic();
1638 }
1639 } else {
1640 auto pair =
1641 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1642 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1643 }
1644 }
1645}
1646
1647/// Returns the sorted CallTargetMap \p M by count in descending order.
1651 for (const auto &I : SampleRecord::SortCallTargets(M)) {
1652 R.emplace_back(
1653 InstrProfValueData{I.first.getHashCode(), I.second});
1654 }
1655 return R;
1656}
1657
1658// Generate MD_prof metadata for every branch instruction using the
1659// edge weights computed during propagation.
1660void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1661 // Generate MD_prof metadata for every branch instruction using the
1662 // edge weights computed during propagation.
1663 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1664 LLVMContext &Ctx = F.getContext();
1665 MDBuilder MDB(Ctx);
1666 for (auto &BI : F) {
1667 BasicBlock *BB = &BI;
1668
1669 if (BlockWeights[BB]) {
1670 for (auto &I : *BB) {
1671 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1672 continue;
1673 if (!cast<CallBase>(I).getCalledFunction()) {
1674 const DebugLoc &DLoc = I.getDebugLoc();
1675 if (!DLoc)
1676 continue;
1677 const DILocation *DIL = DLoc;
1678 const FunctionSamples *FS = findFunctionSamples(I);
1679 if (!FS)
1680 continue;
1682 auto T = FS->findCallTargetMapAt(CallSite);
1683 if (!T || T.get().empty())
1684 continue;
1686 // Prorate the callsite counts based on the pre-ICP distribution
1687 // factor to reflect what is already done to the callsite before
1688 // ICP, such as calliste cloning.
1689 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1690 if (Probe->Factor < 1)
1691 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1692 }
1693 }
1694 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1696 uint64_t Sum = 0;
1697 for (const auto &C : T.get())
1698 Sum += C.second;
1699 // With CSSPGO all indirect call targets are counted torwards the
1700 // original indirect call site in the profile, including both
1701 // inlined and non-inlined targets.
1703 if (const FunctionSamplesMap *M =
1704 FS->findFunctionSamplesMapAt(CallSite)) {
1705 for (const auto &NameFS : *M)
1706 Sum += NameFS.second.getHeadSamplesEstimate();
1707 }
1708 }
1709 if (Sum)
1710 updateIDTMetaData(I, SortedCallTargets, Sum);
1711 else if (OverwriteExistingWeights)
1712 I.setMetadata(LLVMContext::MD_prof, nullptr);
1713 } else if (!isa<IntrinsicInst>(&I)) {
1714 setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])});
1715 }
1716 }
1718 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1719 // clear it for cold code.
1720 for (auto &I : *BB) {
1721 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1722 if (cast<CallBase>(I).isIndirectCall()) {
1723 I.setMetadata(LLVMContext::MD_prof, nullptr);
1724 } else {
1726 }
1727 }
1728 }
1729 }
1730
1731 Instruction *TI = BB->getTerminator();
1732 if (TI->getNumSuccessors() == 1)
1733 continue;
1734 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1735 !isa<IndirectBrInst>(TI))
1736 continue;
1737
1738 DebugLoc BranchLoc = TI->getDebugLoc();
1739 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1740 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1741 : Twine("<UNKNOWN LOCATION>"))
1742 << ".\n");
1744 uint32_t MaxWeight = 0;
1745 Instruction *MaxDestInst;
1746 // Since profi treats multiple edges (multiway branches) as a single edge,
1747 // we need to distribute the computed weight among the branches. We do
1748 // this by evenly splitting the edge weight among destinations.
1750 std::vector<uint64_t> EdgeIndex;
1752 EdgeIndex.resize(TI->getNumSuccessors());
1753 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1754 const BasicBlock *Succ = TI->getSuccessor(I);
1755 EdgeIndex[I] = EdgeMultiplicity[Succ];
1756 EdgeMultiplicity[Succ]++;
1757 }
1758 }
1759 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1760 BasicBlock *Succ = TI->getSuccessor(I);
1761 Edge E = std::make_pair(BB, Succ);
1762 uint64_t Weight = EdgeWeights[E];
1763 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1764 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1765 // if needed. Sample counts in profiles are 64-bit unsigned values,
1766 // but internally branch weights are expressed as 32-bit values.
1767 if (Weight > std::numeric_limits<uint32_t>::max()) {
1768 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1769 Weight = std::numeric_limits<uint32_t>::max();
1770 }
1771 if (!SampleProfileUseProfi) {
1772 // Weight is added by one to avoid propagation errors introduced by
1773 // 0 weights.
1774 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1775 } else {
1776 // Profi creates proper weights that do not require "+1" adjustments but
1777 // we evenly split the weight among branches with the same destination.
1778 uint64_t W = Weight / EdgeMultiplicity[Succ];
1779 // Rounding up, if needed, so that first branches are hotter.
1780 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1781 W++;
1782 Weights.push_back(static_cast<uint32_t>(W));
1783 }
1784 if (Weight != 0) {
1785 if (Weight > MaxWeight) {
1786 MaxWeight = Weight;
1787 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1788 }
1789 }
1790 }
1791
1792 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1793
1794 uint64_t TempWeight;
1795 // Only set weights if there is at least one non-zero weight.
1796 // In any other case, let the analyzer set weights.
1797 // Do not set weights if the weights are present unless under
1798 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1799 // twice. If the first annotation already set the weights, the second pass
1800 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1801 // weight should have their existing metadata (possibly annotated by LTO
1802 // prelink) cleared.
1803 if (MaxWeight > 0 &&
1804 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1805 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1806 setBranchWeights(*TI, Weights);
1807 ORE->emit([&]() {
1808 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1809 << "most popular destination for conditional branches at "
1810 << ore::NV("CondBranchesLoc", BranchLoc);
1811 });
1812 } else {
1814 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1815 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1816 } else {
1817 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1818 }
1819 }
1820 }
1821}
1822
1823/// Once all the branch weights are computed, we emit the MD_prof
1824/// metadata on BB using the computed values for each of its branches.
1825///
1826/// \param F The function to query.
1827///
1828/// \returns true if \p F was modified. Returns false, otherwise.
1829bool SampleProfileLoader::emitAnnotations(Function &F) {
1830 bool Changed = false;
1831
1833 if (!ProbeManager->profileIsValid(F, *Samples)) {
1834 LLVM_DEBUG(
1835 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1836 << F.getName() << "\n");
1837 ++NumMismatchedProfile;
1839 return false;
1840 }
1841 ++NumMatchedProfile;
1842 } else {
1843 if (getFunctionLoc(F) == 0)
1844 return false;
1845
1846 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1847 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1848 }
1849
1850 DenseSet<GlobalValue::GUID> InlinedGUIDs;
1852 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1853 else
1854 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1855
1856 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1857
1858 if (Changed)
1859 generateMDProfMetadata(F);
1860
1861 emitCoverageRemarks(F);
1862 return Changed;
1863}
1864
1865std::unique_ptr<ProfiledCallGraph>
1866SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1867 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
1869 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1870 else
1871 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1872
1873 // Add all functions into the profiled call graph even if they are not in
1874 // the profile. This makes sure functions missing from the profile still
1875 // gets a chance to be processed.
1876 for (Function &F : M) {
1877 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
1878 continue;
1879 ProfiledCG->addProfiledFunction(
1881 }
1882
1883 return ProfiledCG;
1884}
1885
1886std::vector<Function *>
1887SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1888 std::vector<Function *> FunctionOrderList;
1889 FunctionOrderList.reserve(M.size());
1890
1892 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1893 "together with -sample-profile-top-down-load.\n";
1894
1895 if (!ProfileTopDownLoad) {
1896 if (ProfileMergeInlinee) {
1897 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1898 // because the profile for a function may be used for the profile
1899 // annotation of its outline copy before the profile merging of its
1900 // non-inlined inline instances, and that is not the way how
1901 // ProfileMergeInlinee is supposed to work.
1902 ProfileMergeInlinee = false;
1903 }
1904
1905 for (Function &F : M)
1906 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1907 FunctionOrderList.push_back(&F);
1908 return FunctionOrderList;
1909 }
1910
1912 !UseProfiledCallGraph.getNumOccurrences())) {
1913 // Use profiled call edges to augment the top-down order. There are cases
1914 // that the top-down order computed based on the static call graph doesn't
1915 // reflect real execution order. For example
1916 //
1917 // 1. Incomplete static call graph due to unknown indirect call targets.
1918 // Adjusting the order by considering indirect call edges from the
1919 // profile can enable the inlining of indirect call targets by allowing
1920 // the caller processed before them.
1921 // 2. Mutual call edges in an SCC. The static processing order computed for
1922 // an SCC may not reflect the call contexts in the context-sensitive
1923 // profile, thus may cause potential inlining to be overlooked. The
1924 // function order in one SCC is being adjusted to a top-down order based
1925 // on the profile to favor more inlining. This is only a problem with CS
1926 // profile.
1927 // 3. Transitive indirect call edges due to inlining. When a callee function
1928 // (say B) is inlined into a caller function (say A) in LTO prelink,
1929 // every call edge originated from the callee B will be transferred to
1930 // the caller A. If any transferred edge (say A->C) is indirect, the
1931 // original profiled indirect edge B->C, even if considered, would not
1932 // enforce a top-down order from the caller A to the potential indirect
1933 // call target C in LTO postlink since the inlined callee B is gone from
1934 // the static call graph.
1935 // 4. #3 can happen even for direct call targets, due to functions defined
1936 // in header files. A header function (say A), when included into source
1937 // files, is defined multiple times but only one definition survives due
1938 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1939 // definitions can be useless based on a local file scope. More
1940 // importantly, the inlinee (say B), once fully inlined to a
1941 // to-be-dropped A, will have no profile to consume when its outlined
1942 // version is compiled. This can lead to a profile-less prelink
1943 // compilation for the outlined version of B which may be called from
1944 // external modules. while this isn't easy to fix, we rely on the
1945 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1946 // the A can be inlined in its local scope in prelink, it may not exist
1947 // in the merged IR in postlink, and we'll need the profiled call edges
1948 // to enforce a top-down order for the rest of the functions.
1949 //
1950 // Considering those cases, a profiled call graph completely independent of
1951 // the static call graph is constructed based on profile data, where
1952 // function objects are not even needed to handle case #3 and case 4.
1953 //
1954 // Note that static callgraph edges are completely ignored since they
1955 // can be conflicting with profiled edges for cyclic SCCs and may result in
1956 // an SCC order incompatible with profile-defined one. Using strictly
1957 // profile order ensures a maximum inlining experience. On the other hand,
1958 // static call edges are not so important when they don't correspond to a
1959 // context in the profile.
1960
1961 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1962 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1963 while (!CGI.isAtEnd()) {
1964 auto Range = *CGI;
1965 if (SortProfiledSCC) {
1966 // Sort nodes in one SCC based on callsite hotness.
1968 Range = *SI;
1969 }
1970 for (auto *Node : Range) {
1971 Function *F = SymbolMap.lookup(Node->Name);
1972 if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
1973 FunctionOrderList.push_back(F);
1974 }
1975 ++CGI;
1976 }
1977 } else {
1978 CG.buildRefSCCs();
1979 for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
1980 for (LazyCallGraph::SCC &C : RC) {
1981 for (LazyCallGraph::Node &N : C) {
1982 Function &F = N.getFunction();
1983 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1984 FunctionOrderList.push_back(&F);
1985 }
1986 }
1987 }
1988 }
1989
1990 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1991
1992 LLVM_DEBUG({
1993 dbgs() << "Function processing order:\n";
1994 for (auto F : FunctionOrderList) {
1995 dbgs() << F->getName() << "\n";
1996 }
1997 });
1998
1999 return FunctionOrderList;
2000}
2001
2002bool SampleProfileLoader::doInitialization(Module &M,
2004 auto &Ctx = M.getContext();
2005
2006 auto ReaderOrErr = SampleProfileReader::create(
2007 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
2008 if (std::error_code EC = ReaderOrErr.getError()) {
2009 std::string Msg = "Could not open profile: " + EC.message();
2010 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
2011 return false;
2012 }
2013 Reader = std::move(ReaderOrErr.get());
2014 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
2015 // set module before reading the profile so reader may be able to only
2016 // read the function profiles which are used by the current module.
2017 Reader->setModule(&M);
2018 if (std::error_code EC = Reader->read()) {
2019 std::string Msg = "profile reading failed: " + EC.message();
2020 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
2021 return false;
2022 }
2023
2024 PSL = Reader->getProfileSymbolList();
2025
2026 // While profile-sample-accurate is on, ignore symbol list.
2027 ProfAccForSymsInList =
2029 if (ProfAccForSymsInList) {
2030 NamesInProfile.clear();
2031 GUIDsInProfile.clear();
2032 if (auto NameTable = Reader->getNameTable()) {
2034 for (auto Name : *NameTable)
2035 GUIDsInProfile.insert(Name.getHashCode());
2036 } else {
2037 for (auto Name : *NameTable)
2038 NamesInProfile.insert(Name.stringRef());
2039 }
2040 }
2041 CoverageTracker.setProfAccForSymsInList(true);
2042 }
2043
2044 if (FAM && !ProfileInlineReplayFile.empty()) {
2045 ExternalInlineAdvisor = getReplayInlineAdvisor(
2046 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
2051 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
2052 }
2053
2054 // Apply tweaks if context-sensitive or probe-based profile is available.
2055 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
2056 Reader->profileIsProbeBased()) {
2057 if (!UseIterativeBFIInference.getNumOccurrences())
2059 if (!SampleProfileUseProfi.getNumOccurrences())
2060 SampleProfileUseProfi = true;
2061 if (!EnableExtTspBlockPlacement.getNumOccurrences())
2063 // Enable priority-base inliner and size inline by default for CSSPGO.
2064 if (!ProfileSizeInline.getNumOccurrences())
2065 ProfileSizeInline = true;
2066 if (!CallsitePrioritizedInline.getNumOccurrences())
2068 // For CSSPGO, we also allow recursive inline to best use context profile.
2069 if (!AllowRecursiveInline.getNumOccurrences())
2070 AllowRecursiveInline = true;
2071
2072 if (Reader->profileIsPreInlined()) {
2073 if (!UsePreInlinerDecision.getNumOccurrences())
2074 UsePreInlinerDecision = true;
2075 }
2076
2077 // Enable stale profile matching by default for probe-based profile.
2078 // Currently the matching relies on if the checksum mismatch is detected,
2079 // which is currently only available for pseudo-probe mode. Removing the
2080 // checksum check could cause regressions for some cases, so further tuning
2081 // might be needed if we want to enable it for all cases.
2082 if (Reader->profileIsProbeBased() &&
2083 !SalvageStaleProfile.getNumOccurrences()) {
2084 SalvageStaleProfile = true;
2085 }
2086
2087 if (!Reader->profileIsCS()) {
2088 // Non-CS profile should be fine without a function size budget for the
2089 // inliner since the contexts in the profile are either all from inlining
2090 // in the prevoius build or pre-computed by the preinliner with a size
2091 // cap, thus they are bounded.
2092 if (!ProfileInlineLimitMin.getNumOccurrences())
2093 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2094 if (!ProfileInlineLimitMax.getNumOccurrences())
2095 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2096 }
2097 }
2098
2099 if (Reader->profileIsCS()) {
2100 // Tracker for profiles under different context
2101 ContextTracker = std::make_unique<SampleContextTracker>(
2102 Reader->getProfiles(), &GUIDToFuncNameMap);
2103 }
2104
2105 // Load pseudo probe descriptors for probe-based function samples.
2106 if (Reader->profileIsProbeBased()) {
2107 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2108 if (!ProbeManager->moduleIsProbed(M)) {
2109 const char *Msg =
2110 "Pseudo-probe-based profile requires SampleProfileProbePass";
2111 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2112 DS_Warning));
2113 return false;
2114 }
2115 }
2116
2119 MatchingManager =
2120 std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
2121 }
2122
2123 return true;
2124}
2125
2126void SampleProfileMatcher::findIRAnchors(
2127 const Function &F, std::map<LineLocation, StringRef> &IRAnchors) {
2128 // For inlined code, recover the original callsite and callee by finding the
2129 // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the
2130 // top-level frame is "main:1", the callsite is "1" and the callee is "foo".
2131 auto FindTopLevelInlinedCallsite = [](const DILocation *DIL) {
2132 assert((DIL && DIL->getInlinedAt()) && "No inlined callsite");
2133 const DILocation *PrevDIL = nullptr;
2134 do {
2135 PrevDIL = DIL;
2136 DIL = DIL->getInlinedAt();
2137 } while (DIL->getInlinedAt());
2138
2140 StringRef CalleeName = PrevDIL->getSubprogramLinkageName();
2141 return std::make_pair(Callsite, CalleeName);
2142 };
2143
2144 auto GetCanonicalCalleeName = [](const CallBase *CB) {
2145 StringRef CalleeName = UnknownIndirectCallee;
2146 if (Function *Callee = CB->getCalledFunction())
2147 CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName());
2148 return CalleeName;
2149 };
2150
2151 // Extract profile matching anchors in the IR.
2152 for (auto &BB : F) {
2153 for (auto &I : BB) {
2154 DILocation *DIL = I.getDebugLoc();
2155 if (!DIL)
2156 continue;
2157
2159 if (auto Probe = extractProbe(I)) {
2160 // Flatten inlined IR for the matching.
2161 if (DIL->getInlinedAt()) {
2162 IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
2163 } else {
2164 // Use empty StringRef for basic block probe.
2165 StringRef CalleeName;
2166 if (const auto *CB = dyn_cast<CallBase>(&I)) {
2167 // Skip the probe inst whose callee name is "llvm.pseudoprobe".
2168 if (!isa<IntrinsicInst>(&I))
2169 CalleeName = GetCanonicalCalleeName(CB);
2170 }
2171 IRAnchors.emplace(LineLocation(Probe->Id, 0), CalleeName);
2172 }
2173 }
2174 } else {
2175 // TODO: For line-number based profile(AutoFDO), currently only support
2176 // find callsite anchors. In future, we need to parse all the non-call
2177 // instructions to extract the line locations for profile matching.
2178 if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
2179 continue;
2180
2181 if (DIL->getInlinedAt()) {
2182 IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL));
2183 } else {
2185 StringRef CalleeName = GetCanonicalCalleeName(dyn_cast<CallBase>(&I));
2186 IRAnchors.emplace(Callsite, CalleeName);
2187 }
2188 }
2189 }
2190 }
2191}
2192
2193void SampleProfileMatcher::countMismatchedSamples(const FunctionSamples &FS) {
2194 const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());
2195 // Skip the function that is external or renamed.
2196 if (!FuncDesc)
2197 return;
2198
2199 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) {
2200 MismatchedFuncHashSamples += FS.getTotalSamples();
2201 return;
2202 }
2203 for (const auto &I : FS.getCallsiteSamples())
2204 for (const auto &CS : I.second)
2205 countMismatchedSamples(CS.second);
2206}
2207
2208void SampleProfileMatcher::countProfileMismatches(
2209 const Function &F, const FunctionSamples &FS,
2210 const std::map<LineLocation, StringRef> &IRAnchors,
2211 const std::map<LineLocation, std::unordered_set<FunctionId>>
2212 &ProfileAnchors) {
2213 [[maybe_unused]] bool IsFuncHashMismatch = false;
2215 TotalFuncHashSamples += FS.getTotalSamples();
2216 TotalProfiledFunc++;
2217 const auto *FuncDesc = ProbeManager->getDesc(F);
2218 if (FuncDesc) {
2219 if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) {
2220 NumMismatchedFuncHash++;
2221 IsFuncHashMismatch = true;
2222 }
2223 countMismatchedSamples(FS);
2224 }
2225 }
2226
2227 uint64_t FuncMismatchedCallsites = 0;
2228 uint64_t FuncProfiledCallsites = 0;
2229 countProfileCallsiteMismatches(FS, IRAnchors, ProfileAnchors,
2230 FuncMismatchedCallsites,
2231 FuncProfiledCallsites);
2232 TotalProfiledCallsites += FuncProfiledCallsites;
2233 NumMismatchedCallsites += FuncMismatchedCallsites;
2234 LLVM_DEBUG({
2235 if (FunctionSamples::ProfileIsProbeBased && !IsFuncHashMismatch &&
2236 FuncMismatchedCallsites)
2237 dbgs() << "Function checksum is matched but there are "
2238 << FuncMismatchedCallsites << "/" << FuncProfiledCallsites
2239 << " mismatched callsites.\n";
2240 });
2241}
2242
2243void SampleProfileMatcher::countProfileCallsiteMismatches(
2244 const FunctionSamples &FS,
2245 const std::map<LineLocation, StringRef> &IRAnchors,
2246 const std::map<LineLocation, std::unordered_set<FunctionId>>
2247 &ProfileAnchors,
2248 uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites) {
2249
2250 // Check if there are any callsites in the profile that does not match to any
2251 // IR callsites, those callsite samples will be discarded.
2252 for (const auto &I : ProfileAnchors) {
2253 const auto &Loc = I.first;
2254 const auto &Callees = I.second;
2255 assert(!Callees.empty() && "Callees should not be empty");
2256
2257 StringRef IRCalleeName;
2258 const auto &IR = IRAnchors.find(Loc);
2259 if (IR != IRAnchors.end())
2260 IRCalleeName = IR->second;
2261
2262 // Compute number of samples in the original profile.
2263 uint64_t CallsiteSamples = 0;
2264 auto CTM = FS.findCallTargetMapAt(Loc);
2265 if (CTM) {
2266 for (const auto &I : CTM.get())
2267 CallsiteSamples += I.second;
2268 }
2269 const auto *FSMap = FS.findFunctionSamplesMapAt(Loc);
2270 if (FSMap) {
2271 for (const auto &I : *FSMap)
2272 CallsiteSamples += I.second.getTotalSamples();
2273 }
2274
2275 bool CallsiteIsMatched = false;
2276 // Since indirect call does not have CalleeName, check conservatively if
2277 // callsite in the profile is a callsite location. This is to reduce num of
2278 // false positive since otherwise all the indirect call samples will be
2279 // reported as mismatching.
2280 if (IRCalleeName == UnknownIndirectCallee)
2281 CallsiteIsMatched = true;
2282 else if (Callees.size() == 1 && Callees.count(getRepInFormat(IRCalleeName)))
2283 CallsiteIsMatched = true;
2284
2285 FuncProfiledCallsites++;
2286 TotalCallsiteSamples += CallsiteSamples;
2287 if (!CallsiteIsMatched) {
2288 FuncMismatchedCallsites++;
2289 MismatchedCallsiteSamples += CallsiteSamples;
2290 }
2291 }
2292}
2293
2294void SampleProfileMatcher::findProfileAnchors(const FunctionSamples &FS,
2295 std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors) {
2296 auto isInvalidLineOffset = [](uint32_t LineOffset) {
2297 return LineOffset & 0x8000;
2298 };
2299
2300 for (const auto &I : FS.getBodySamples()) {
2301 const LineLocation &Loc = I.first;
2302 if (isInvalidLineOffset(Loc.LineOffset))
2303 continue;
2304 for (const auto &I : I.second.getCallTargets()) {
2305 auto Ret = ProfileAnchors.try_emplace(Loc,
2306 std::unordered_set<FunctionId>());
2307 Ret.first->second.insert(I.first);
2308 }
2309 }
2310
2311 for (const auto &I : FS.getCallsiteSamples()) {
2312 const LineLocation &Loc = I.first;
2313 if (isInvalidLineOffset(Loc.LineOffset))
2314 continue;
2315 const auto &CalleeMap = I.second;
2316 for (const auto &I : CalleeMap) {
2317 auto Ret = ProfileAnchors.try_emplace(Loc,
2318 std::unordered_set<FunctionId>());
2319 Ret.first->second.insert(I.first);
2320 }
2321 }
2322}
2323
2324// Call target name anchor based profile fuzzy matching.
2325// Input:
2326// For IR locations, the anchor is the callee name of direct callsite; For
2327// profile locations, it's the call target name for BodySamples or inlinee's
2328// profile name for CallsiteSamples.
2329// Matching heuristic:
2330// First match all the anchors in lexical order, then split the non-anchor
2331// locations between the two anchors evenly, first half are matched based on the
2332// start anchor, second half are matched based on the end anchor.
2333// For example, given:
2334// IR locations: [1, 2(foo), 3, 5, 6(bar), 7]
2335// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
2336// The matching gives:
2337// [1, 2(foo), 3, 5, 6(bar), 7]
2338// | | | | | |
2339// [1, 2, 3(foo), 4, 7, 8(bar), 9]
2340// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
2341void SampleProfileMatcher::runStaleProfileMatching(
2342 const Function &F,
2343 const std::map<LineLocation, StringRef> &IRAnchors,
2344 const std::map<LineLocation, std::unordered_set<FunctionId>>
2345 &ProfileAnchors,
2346 LocToLocMap &IRToProfileLocationMap) {
2347 LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
2348 << "\n");
2349 assert(IRToProfileLocationMap.empty() &&
2350 "Run stale profile matching only once per function");
2351
2352 std::unordered_map<FunctionId, std::set<LineLocation>>
2353 CalleeToCallsitesMap;
2354 for (const auto &I : ProfileAnchors) {
2355 const auto &Loc = I.first;
2356 const auto &Callees = I.second;
2357 // Filter out possible indirect calls, use direct callee name as anchor.
2358 if (Callees.size() == 1) {
2359 FunctionId CalleeName = *Callees.begin();
2360 const auto &Candidates = CalleeToCallsitesMap.try_emplace(
2361 CalleeName, std::set<LineLocation>());
2362 Candidates.first->second.insert(Loc);
2363 }
2364 }
2365
2366 auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
2367 // Skip the unchanged location mapping to save memory.
2368 if (From != To)
2369 IRToProfileLocationMap.insert({From, To});
2370 };
2371
2372 // Use function's beginning location as the initial anchor.
2373 int32_t LocationDelta = 0;
2374 SmallVector<LineLocation> LastMatchedNonAnchors;
2375
2376 for (const auto &IR : IRAnchors) {
2377 const auto &Loc = IR.first;
2378 auto CalleeName = IR.second;
2379 bool IsMatchedAnchor = false;
2380 // Match the anchor location in lexical order.
2381 if (!CalleeName.empty()) {
2382 auto CandidateAnchors = CalleeToCallsitesMap.find(
2383 getRepInFormat(CalleeName));
2384 if (CandidateAnchors != CalleeToCallsitesMap.end() &&
2385 !CandidateAnchors->second.empty()) {
2386 auto CI = CandidateAnchors->second.begin();
2387 const auto Candidate = *CI;
2388 CandidateAnchors->second.erase(CI);
2389 InsertMatching(Loc, Candidate);
2390 LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
2391 << " is matched from " << Loc << " to " << Candidate
2392 << "\n");
2393 LocationDelta = Candidate.LineOffset - Loc.LineOffset;
2394
2395 // Match backwards for non-anchor locations.
2396 // The locations in LastMatchedNonAnchors have been matched forwards
2397 // based on the previous anchor, spilt it evenly and overwrite the
2398 // second half based on the current anchor.
2399 for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
2400 I < LastMatchedNonAnchors.size(); I++) {
2401 const auto &L = LastMatchedNonAnchors[I];
2402 uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
2403 LineLocation Candidate(CandidateLineOffset, L.Discriminator);
2404 InsertMatching(L, Candidate);
2405 LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
2406 << " to " << Candidate << "\n");
2407 }
2408
2409 IsMatchedAnchor = true;
2410 LastMatchedNonAnchors.clear();
2411 }
2412 }
2413
2414 // Match forwards for non-anchor locations.
2415 if (!IsMatchedAnchor) {
2416 uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta;
2417 LineLocation Candidate(CandidateLineOffset, Loc.Discriminator);
2418 InsertMatching(Loc, Candidate);
2419 LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to "
2420 << Candidate << "\n");
2421 LastMatchedNonAnchors.emplace_back(Loc);
2422 }
2423 }
2424}
2425
2426void SampleProfileMatcher::runOnFunction(const Function &F) {
2427 // We need to use flattened function samples for matching.
2428 // Unlike IR, which includes all callsites from the source code, the callsites
2429 // in profile only show up when they are hit by samples, i,e. the profile
2430 // callsites in one context may differ from those in another context. To get
2431 // the maximum number of callsites, we merge the function profiles from all
2432 // contexts, aka, the flattened profile to find profile anchors.
2433 const auto *FSFlattened = getFlattenedSamplesFor(F);
2434 if (!FSFlattened)
2435 return;
2436
2437 // Anchors for IR. It's a map from IR location to callee name, callee name is
2438 // empty for non-call instruction and use a dummy name(UnknownIndirectCallee)
2439 // for unknown indrect callee name.
2440 std::map<LineLocation, StringRef> IRAnchors;
2441 findIRAnchors(F, IRAnchors);
2442 // Anchors for profile. It's a map from callsite location to a set of callee
2443 // name.
2444 std::map<LineLocation, std::unordered_set<FunctionId>> ProfileAnchors;
2445 findProfileAnchors(*FSFlattened, ProfileAnchors);
2446
2447 // Detect profile mismatch for profile staleness metrics report.
2448 // Skip reporting the metrics for imported functions.
2449 if (!GlobalValue::isAvailableExternallyLinkage(F.getLinkage()) &&
2451 // Use top-level nested FS for counting profile mismatch metrics since
2452 // currently once a callsite is mismatched, all its children profiles are
2453 // dropped.
2454 if (const auto *FS = Reader.getSamplesFor(F))
2455 countProfileMismatches(F, *FS, IRAnchors, ProfileAnchors);
2456 }
2457
2458 // Run profile matching for checksum mismatched profile, currently only
2459 // support for pseudo-probe.
2461 !ProbeManager->profileIsValid(F, *FSFlattened)) {
2462 // The matching result will be saved to IRToProfileLocationMap, create a new
2463 // map for each function.
2464 runStaleProfileMatching(F, IRAnchors, ProfileAnchors,
2465 getIRToProfileLocationMap(F));
2466 }
2467}
2468
2469void SampleProfileMatcher::runOnModule() {
2470 ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles,
2472 for (auto &F : M) {
2473 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
2474 continue;
2476 }
2478 distributeIRToProfileLocationMap();
2479
2482 errs() << "(" << NumMismatchedFuncHash << "/" << TotalProfiledFunc << ")"
2483 << " of functions' profile are invalid and "
2484 << " (" << MismatchedFuncHashSamples << "/" << TotalFuncHashSamples
2485 << ")"
2486 << " of samples are discarded due to function hash mismatch.\n";
2487 }
2488 errs() << "(" << NumMismatchedCallsites << "/" << TotalProfiledCallsites
2489 << ")"
2490 << " of callsites' profile are invalid and "
2491 << "(" << MismatchedCallsiteSamples << "/" << TotalCallsiteSamples
2492 << ")"
2493 << " of samples are discarded due to callsite location mismatch.\n";
2494 }
2495
2497 LLVMContext &Ctx = M.getContext();
2498 MDBuilder MDB(Ctx);
2499
2502 ProfStatsVec.emplace_back("NumMismatchedFuncHash", NumMismatchedFuncHash);
2503 ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
2504 ProfStatsVec.emplace_back("MismatchedFuncHashSamples",
2505 MismatchedFuncHashSamples);
2506 ProfStatsVec.emplace_back("TotalFuncHashSamples", TotalFuncHashSamples);
2507 }
2508
2509 ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
2510 ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
2511 ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
2512 MismatchedCallsiteSamples);
2513 ProfStatsVec.emplace_back("TotalCallsiteSamples", TotalCallsiteSamples);
2514
2515 auto *MD = MDB.createLLVMStats(ProfStatsVec);
2516 auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
2517 NMD->addOperand(MD);
2518 }
2519}
2520
2521void SampleProfileMatcher::distributeIRToProfileLocationMap(
2522 FunctionSamples &FS) {
2523 const auto ProfileMappings = FuncMappings.find(FS.getFuncName());
2524 if (ProfileMappings != FuncMappings.end()) {
2525 FS.setIRToProfileLocationMap(&(ProfileMappings->second));
2526 }
2527
2528 for (auto &Inlinees : FS.getCallsiteSamples()) {
2529 for (auto FS : Inlinees.second) {
2530 distributeIRToProfileLocationMap(FS.second);
2531 }
2532 }
2533}
2534
2535// Use a central place to distribute the matching results. Outlined and inlined
2536// profile with the function name will be set to the same pointer.
2537void SampleProfileMatcher::distributeIRToProfileLocationMap() {
2538 for (auto &I : Reader.getProfiles()) {
2539 distributeIRToProfileLocationMap(I.second);
2540 }
2541}
2542
2543bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2544 ProfileSummaryInfo *_PSI,
2545 LazyCallGraph &CG) {
2546 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2547
2548 PSI = _PSI;
2549 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2550 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
2552 PSI->refresh();
2553 }
2554 // Compute the total number of samples collected in this profile.
2555 for (const auto &I : Reader->getProfiles())
2556 TotalCollectedSamples += I.second.getTotalSamples();
2557
2558 auto Remapper = Reader->getRemapper();
2559 // Populate the symbol map.
2560 for (const auto &N_F : M.getValueSymbolTable()) {
2561 StringRef OrigName = N_F.getKey();
2562 Function *F = dyn_cast<Function>(N_F.getValue());
2563 if (F == nullptr || OrigName.empty())
2564 continue;
2565 SymbolMap[FunctionId(OrigName)] = F;
2567 if (OrigName != NewName && !NewName.empty()) {
2568 auto r = SymbolMap.emplace(FunctionId(NewName), F);
2569 // Failiing to insert means there is already an entry in SymbolMap,
2570 // thus there are multiple functions that are mapped to the same
2571 // stripped name. In this case of name conflicting, set the value
2572 // to nullptr to avoid confusion.
2573 if (!r.second)
2574 r.first->second = nullptr;
2575 OrigName = NewName;
2576 }
2577 // Insert the remapped names into SymbolMap.
2578 if (Remapper) {
2579 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2580 if (*MapName != OrigName && !MapName->empty())
2581 SymbolMap.emplace(FunctionId(*MapName), F);
2582 }
2583 }
2584 }
2585 assert(SymbolMap.count(FunctionId()) == 0 &&
2586 "No empty StringRef should be added in SymbolMap");
2587
2590 MatchingManager->runOnModule();
2591 }
2592
2593 bool retval = false;
2594 for (auto *F : buildFunctionOrder(M, CG)) {
2595 assert(!F->isDeclaration());
2596 clearFunctionData();
2597 retval |= runOnFunction(*F, AM);
2598 }
2599
2600 // Account for cold calls not inlined....
2602 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2603 notInlinedCallInfo)
2604 updateProfileCallee(pair.first, pair.second.entryCount);
2605
2606 return retval;
2607}
2608
2609bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2610 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2611 DILocation2SampleMap.clear();
2612 // By default the entry count is initialized to -1, which will be treated
2613 // conservatively by getEntryCount as the same as unknown (None). This is
2614 // to avoid newly added code to be treated as cold. If we have samples
2615 // this will be overwritten in emitAnnotations.
2616 uint64_t initialEntryCount = -1;
2617
2618 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2619 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2620 // initialize all the function entry counts to 0. It means all the
2621 // functions without profile will be regarded as cold.
2622 initialEntryCount = 0;
2623 // profile-sample-accurate is a user assertion which has a higher precedence
2624 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2625 ProfAccForSymsInList = false;
2626 }
2627 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2628
2629 // PSL -- profile symbol list include all the symbols in sampled binary.
2630 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2631 // old functions without samples being cold, without having to worry
2632 // about new and hot functions being mistakenly treated as cold.
2633 if (ProfAccForSymsInList) {
2634 // Initialize the entry count to 0 for functions in the list.
2635 if (PSL->contains(F.getName()))
2636 initialEntryCount = 0;
2637
2638 // Function in the symbol list but without sample will be regarded as
2639 // cold. To minimize the potential negative performance impact it could
2640 // have, we want to be a little conservative here saying if a function
2641 // shows up in the profile, no matter as outline function, inline instance
2642 // or call targets, treat the function as not being cold. This will handle
2643 // the cases such as most callsites of a function are inlined in sampled
2644 // binary but not inlined in current build (because of source code drift,
2645 // imprecise debug information, or the callsites are all cold individually
2646 // but not cold accumulatively...), so the outline function showing up as
2647 // cold in sampled binary will actually not be cold after current build.
2650 GUIDsInProfile.count(Function::getGUID(CanonName))) ||
2651 (!FunctionSamples::UseMD5 && NamesInProfile.count(CanonName)))
2652 initialEntryCount = -1;
2653 }
2654
2655 // Initialize entry count when the function has no existing entry
2656 // count value.
2657 if (!F.getEntryCount())
2658 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
2659 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2660 if (AM) {
2661 auto &FAM =
2663 .getManager();
2665 } else {
2666 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2667 ORE = OwnedORE.get();
2668 }
2669
2671 Samples = ContextTracker->getBaseSamplesFor(F);
2672 else {
2673 Samples = Reader->getSamplesFor(F);
2674 // Try search in previously inlined functions that were split or duplicated
2675 // into base.
2676 if (!Samples) {
2678 auto It = OutlineFunctionSamples.find(FunctionId(CanonName));
2679 if (It != OutlineFunctionSamples.end()) {
2680 Samples = &It->second;
2681 } else if (auto Remapper = Reader->getRemapper()) {
2682 if (auto RemppedName = Remapper->lookUpNameInProfile(CanonName)) {
2683 It = OutlineFunctionSamples.find(FunctionId(*RemppedName));
2684 if (It != OutlineFunctionSamples.end())
2685 Samples = &It->second;
2686 }
2687 }
2688 }
2689 }
2690
2691 if (Samples && !Samples->empty())
2692 return emitAnnotations(F);
2693 return false;
2694}
2696 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2698 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2699 LTOPhase(LTOPhase), FS(std::move(FS)) {}
2700
2705
2706 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2708 };
2709 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2711 };
2712 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2714 };
2715
2716 if (!FS)
2718
2719 SampleProfileLoader SampleLoader(
2720 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2721 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2722 : ProfileRemappingFileName,
2723 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);
2724
2725 if (!SampleLoader.doInitialization(M, &FAM))
2726 return PreservedAnalyses::all();
2727
2730 if (!SampleLoader.runOnModule(M, &AM, PSI, CG))
2731 return PreservedAnalyses::all();
2732
2733 return PreservedAnalyses::none();
2734}
This file defines the StringMap class.
BlockVerifier::State From
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:680
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
LVReader * CurrentReader
Definition: LVReader.cpp:153
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PriorityQueue class.
This file contains the declarations for profiling metadata utility functions.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
static cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
static cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
static cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:649
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:803
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:213
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1227
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1449
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Diagnostic information for the sample profiler.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:277
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1776
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:273
static bool isAvailableExternallyLinkage(LinkageTypes Linkage)
Definition: GlobalValue.h:374
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:202
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:962
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:438
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1690
const BasicBlock * getParent() const
Definition: Instruction.h:139
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1581
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A node in the call graph.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:193
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:172
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:175
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:178
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Metadata * getMD(LLVMContext &Context, bool AddPartialField=true, bool AddPartialProfileRatioField=true)
Return summary information as metadata.
bool profileIsHashMismatched(const PseudoProbeDescriptor &FuncDesc, const FunctionSamples &Samples) const
bool moduleIsProbed(const Module &M) const
bool profileIsValid(const Function &F, const FunctionSamples &Samples) const
const PseudoProbeDescriptor * getDesc(uint64_t GUID) const
Sample profile inference pass.
void computeDominanceAndLoopInfo(FunctionT &F)
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr)
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:112
iterator end()
Definition: StringMap.h:205
iterator find(StringRef Key)
Definition: StringMap.h:218
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition: StringMap.h:341
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
iterator begin() const
Definition: StringRef.h:111
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
This class represents a function that is read from a sample profile.
Definition: FunctionId.h:36
Representation of the samples collected for a function.
Definition: SampleProf.h:744
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const HashKeyMap< std::unordered_map, FunctionId, Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:1036
FunctionId getFunction() const
Return the function name.
Definition: SampleProf.h:1069
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1085
SampleContext & getContext() const
Definition: SampleProf.h:1185
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:996
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:947
uint64_t getGUID() const
Return the GUID of the context's name.
Definition: SampleProf.h:1204
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:971
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1190
This class is a wrapper to associative container MapT<KeyT, ValueT> using the hash value of the origi...
Definition: HashKeyMap.h:53
static void flattenProfile(SampleProfileMap &ProfileMap, bool ProfileIsCS=false)
Definition: SampleProf.h:1415
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:607
This class provides operator overloads to the map container using MD5 as the key type,...
Definition: SampleProf.h:1306
iterator find(const SampleContext &Ctx)
Definition: SampleProf.h:1317
Sample-based profile reader.
SampleProfileMap & getProfiles()
Return all the profiles.
bool profileIsProbeBased() const
Whether input profile is based on pseudo probes.
FunctionSamples * getSamplesFor(const Function &F)
Return the samples collected for function F.
bool profileIsPreInlined() const
Whether input profile contains ShouldBeInlined contexts.
std::error_code read()
The interface to read sample profiles from the associated file.
SampleProfileReaderItaniumRemapper * getRemapper()
ProfileSummary & getSummary() const
Return the profile summary.
virtual std::vector< FunctionId > * getNameTable()
It includes all the names that have samples either in outline instance or inline instance.
bool profileIsCS() const
Whether input profile is fully context-sensitive.
virtual void setSkipFlatProf(bool Skip)
Don't read profile without context if the flag is set.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
virtual std::unique_ptr< ProfileSymbolList > getProfileSymbolList()
std::unordered_map< FunctionId, uint64_t > CallTargetMap
Definition: SampleProf.h:338
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:406
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:415
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
const CustomOperand< const MCSubtargetInfo & > Msg[]
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:206
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:705
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:121
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
static FunctionId getRepInFormat(StringRef Name)
Get the proper representation of a string according to whether the current Format uses MD5 to represe...
Definition: SampleProf.h:1292
std::unordered_map< LineLocation, LineLocation, LineLocationHash > LocToLocMap
Definition: SampleProf.h:737
std::map< FunctionId, FunctionSamples > FunctionSamplesMap
Definition: SampleProf.h:734
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot wrt to hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets an vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst which is annotated with value profile meta data.
Definition: InstrProf.cpp:1202
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1684
cl::opt< int > ProfileInlineLimitMin
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:76
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1157
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1651
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based in cost (default heuristic).
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
cl::opt< int > SampleHotCallSiteThreshold
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
cl::opt< int > SampleColdCallSiteThreshold
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1853
@ DS_Warning
cl::opt< bool > SortProfiledSCC
cl::opt< int > ProfileInlineLimitMax
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:57
cl::opt< int > ProfileInlineGrowthLimit
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
#define N
Used in the streaming interface as the general argument type.
A wrapper of binary function with basic blocks and jumps.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:59
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
Represents the relative location of an instruction.
Definition: SampleProf.h:280