LLVM 17.0.0git
SampleProfile.cpp
Go to the documentation of this file.
1//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SampleProfileLoader transformation. This pass
10// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12// profile information in the given profile.
13//
14// This pass generates branch weight annotations on the IR:
15//
16// - prof: Represents branch weights. This annotation is added to branches
17// to indicate the weights of each edge coming out of the branch.
18// The weight of each edge is the weight of the target block for
19// that edge. The weight of a block B is computed as the maximum
20// number of samples found in B.
21//
22//===----------------------------------------------------------------------===//
23
25#include "llvm/ADT/ArrayRef.h"
26#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
28#include "llvm/ADT/MapVector.h"
32#include "llvm/ADT/Statistic.h"
33#include "llvm/ADT/StringMap.h"
34#include "llvm/ADT/StringRef.h"
35#include "llvm/ADT/Twine.h"
46#include "llvm/IR/BasicBlock.h"
47#include "llvm/IR/DebugLoc.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/InstrTypes.h"
52#include "llvm/IR/Instruction.h"
55#include "llvm/IR/LLVMContext.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/Module.h"
58#include "llvm/IR/PassManager.h"
59#include "llvm/IR/PseudoProbe.h"
66#include "llvm/Support/Debug.h"
70#include "llvm/Transforms/IPO.h"
80#include <algorithm>
81#include <cassert>
82#include <cstdint>
83#include <functional>
84#include <limits>
85#include <map>
86#include <memory>
87#include <queue>
88#include <string>
89#include <system_error>
90#include <utility>
91#include <vector>
92
93using namespace llvm;
94using namespace sampleprof;
95using namespace llvm::sampleprofutil;
97#define DEBUG_TYPE "sample-profile"
98#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
99
100STATISTIC(NumCSInlined,
101 "Number of functions inlined with context sensitive profile");
102STATISTIC(NumCSNotInlined,
103 "Number of functions not inlined with context sensitive profile");
104STATISTIC(NumMismatchedProfile,
105 "Number of functions with CFG mismatched profile");
106STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
107STATISTIC(NumDuplicatedInlinesite,
108 "Number of inlined callsites with a partial distribution factor");
109
110STATISTIC(NumCSInlinedHitMinLimit,
111 "Number of functions with FDO inline stopped due to min size limit");
112STATISTIC(NumCSInlinedHitMaxLimit,
113 "Number of functions with FDO inline stopped due to max size limit");
115 NumCSInlinedHitGrowthLimit,
116 "Number of functions with FDO inline stopped due to growth size limit");
117
118// Command line option to specify the file to read samples from. This is
119// mainly used for debugging.
121 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
122 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
123
124// The named file contains a set of transformations that may have been applied
125// to the symbol names between the program from which the sample data was
126// collected and the current program's symbols.
128 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
129 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
130
132 "salvage-stale-profile", cl::Hidden, cl::init(false),
133 cl::desc("Salvage stale profile by fuzzy matching and use the remapped "
134 "location for sample profile query."));
135
137 "report-profile-staleness", cl::Hidden, cl::init(false),
138 cl::desc("Compute and report stale profile statistical metrics."));
139
141 "persist-profile-staleness", cl::Hidden, cl::init(false),
142 cl::desc("Compute stale profile statistical metrics and write it into the "
143 "native object file(.llvm_stats section)."));
144
146 "flatten-profile-for-matching", cl::Hidden, cl::init(true),
147 cl::desc(
148 "Use flattened profile for stale profile detection and matching."));
149
151 "profile-sample-accurate", cl::Hidden, cl::init(false),
152 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
153 "callsite and function as having 0 samples. Otherwise, treat "
154 "un-sampled callsites and functions conservatively as unknown. "));
155
157 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
158 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
159 "branches and calls as having 0 samples. Otherwise, treat "
160 "them conservatively as unknown. "));
161
163 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
164 cl::desc("For symbols in profile symbol list, regard their profiles to "
165 "be accurate. It may be overriden by profile-sample-accurate. "));
166
168 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
169 cl::desc("Merge past inlinee's profile to outline version if sample "
170 "profile loader decided not to inline a call site. It will "
171 "only be enabled when top-down order of profile loading is "
172 "enabled. "));
173
175 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
176 cl::desc("Do profile annotation and inlining for functions in top-down "
177 "order of call graph during sample profile loading. It only "
178 "works for new pass manager. "));
179
180static cl::opt<bool>
181 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
182 cl::desc("Process functions in a top-down order "
183 "defined by the profiled call graph when "
184 "-sample-profile-top-down-load is on."));
185
187 "sample-profile-inline-size", cl::Hidden, cl::init(false),
188 cl::desc("Inline cold call sites in profile loader if it's beneficial "
189 "for code size."));
190
191// Since profiles are consumed by many passes, turning on this option has
192// side effects. For instance, pre-link SCC inliner would see merged profiles
193// and inline the hot functions (that are skipped in this pass).
195 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
196 cl::desc("If true, artifically skip inline transformation in sample-loader "
197 "pass, and merge (or scale) profiles (as configured by "
198 "--sample-profile-merge-inlinee)."));
199
200namespace llvm {
202 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
203 cl::desc("Sort profiled recursion by edge weights."));
204
206 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
207 cl::desc("The size growth ratio limit for proirity-based sample profile "
208 "loader inlining."));
209
211 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
212 cl::desc("The lower bound of size growth limit for "
213 "proirity-based sample profile loader inlining."));
214
216 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
217 cl::desc("The upper bound of size growth limit for "
218 "proirity-based sample profile loader inlining."));
219
221 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
222 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
223 "inlining."));
224
226 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
227 cl::desc("Threshold for inlining cold callsites"));
228} // namespace llvm
229
231 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
232 cl::desc(
233 "Relative hotness percentage threshold for indirect "
234 "call promotion in proirity-based sample profile loader inlining."));
235
237 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
238 cl::desc(
239 "Skip relative hotness check for ICP up to given number of targets."));
240
242 "sample-profile-prioritized-inline", cl::Hidden,
243
244 cl::desc("Use call site prioritized inlining for sample profile loader."
245 "Currently only CSSPGO is supported."));
246
248 "sample-profile-use-preinliner", cl::Hidden,
249
250 cl::desc("Use the preinliner decisions stored in profile context."));
251
253 "sample-profile-recursive-inline", cl::Hidden,
254
255 cl::desc("Allow sample loader inliner to inline recursive calls."));
256
258 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
259 cl::desc(
260 "Optimization remarks file containing inline remarks to be replayed "
261 "by inlining from sample profile loader."),
262 cl::Hidden);
263
265 "sample-profile-inline-replay-scope",
266 cl::init(ReplayInlinerSettings::Scope::Function),
267 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
268 "Replay on functions that have remarks associated "
269 "with them (default)"),
270 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
271 "Replay on the entire module")),
272 cl::desc("Whether inline replay should be applied to the entire "
273 "Module or just the Functions (default) that are present as "
274 "callers in remarks during sample profile inlining."),
275 cl::Hidden);
276
278 "sample-profile-inline-replay-fallback",
279 cl::init(ReplayInlinerSettings::Fallback::Original),
282 ReplayInlinerSettings::Fallback::Original, "Original",
283 "All decisions not in replay send to original advisor (default)"),
284 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
285 "AlwaysInline", "All decisions not in replay are inlined"),
286 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
287 "All decisions not in replay are not inlined")),
288 cl::desc("How sample profile inline replay treats sites that don't come "
289 "from the replay. Original: defers to original advisor, "
290 "AlwaysInline: inline all sites not in replay, NeverInline: "
291 "inline no sites not in replay"),
292 cl::Hidden);
293
295 "sample-profile-inline-replay-format",
296 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
298 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
299 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
300 "<Line Number>:<Column Number>"),
301 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
302 "LineDiscriminator", "<Line Number>.<Discriminator>"),
303 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
304 "LineColumnDiscriminator",
305 "<Line Number>:<Column Number>.<Discriminator> (default)")),
306 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
307
309 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
310 cl::desc("Max number of promotions for a single indirect "
311 "call callsite in sample profile loader"));
312
314 "overwrite-existing-weights", cl::Hidden, cl::init(false),
315 cl::desc("Ignore existing branch weights on IR and always overwrite."));
316
318 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
319 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
320 "sample-profile inline pass name."));
321
322namespace llvm {
324}
325
326namespace {
327
328using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
329using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
330using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
331using EdgeWeightMap = DenseMap<Edge, uint64_t>;
332using BlockEdgeMap =
334
// Scoped helper around the GUID -> function-name map. The constructor fills
// the map and attaches it to the profile; the destructor clears the map and
// detaches it again. Both are no-ops unless the reader reports useMD5().
335class GUIDToFuncNameMapper {
336public:
 // Records every function of M in GUIDToFuncNameMap, keyed by the GUID of
 // its name, then hands the map to all FunctionSamples known to Reader.
337 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
338 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
339 : CurrentReader(Reader), CurrentModule(M),
340 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
341 if (!CurrentReader.useMD5())
342 return;
343
344 for (const auto &F : CurrentModule) {
345 StringRef OrigName = F.getName();
346 CurrentGUIDToFuncNameMap.insert(
347 {Function::getGUID(OrigName), OrigName});
348
349 // Local to global var promotion used by optimization like thinlto
350 // will rename the var and add suffix like ".llvm.xxx" to the
351 // original local name. In sample profile, the suffixes of function
352 // names are all stripped. Since it is possible that the mapper is
353 // built in post-thin-link phase and var promotion has been done,
354 // we need to add the substring of function name without the suffix
355 // into the GUIDToFuncNameMap.
357 if (CanonName != OrigName)
358 CurrentGUIDToFuncNameMap.insert(
359 {Function::getGUID(CanonName), CanonName});
360 }
361
362 // Update GUIDToFuncNameMap for each function including inlinees.
363 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
364 }
365
 // Undoes the constructor: empties the map and resets the map pointer held
 // by every FunctionSamples to nullptr.
366 ~GUIDToFuncNameMapper() {
367 if (!CurrentReader.useMD5())
368 return;
369
370 CurrentGUIDToFuncNameMap.clear();
371
372 // Reset GUIDToFuncNameMap of each function as they're no
373 // longer valid at this point.
374 SetGUIDToFuncNameMapForAll(nullptr);
375 }
376
377private:
 // Sets FunctionSamples::GUIDToFuncNameMap to Map on every top-level profile
 // and, via a work queue, on every FunctionSamples nested in its callsite
 // samples.
378 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
379 std::queue<FunctionSamples *> FSToUpdate;
380 for (auto &IFS : CurrentReader.getProfiles()) {
381 FSToUpdate.push(&IFS.second);
382 }
383
384 while (!FSToUpdate.empty()) {
385 FunctionSamples *FS = FSToUpdate.front();
386 FSToUpdate.pop();
387 FS->GUIDToFuncNameMap = Map;
388 for (const auto &ICS : FS->getCallsiteSamples()) {
389 const FunctionSamplesMap &FSMap = ICS.second;
390 for (const auto &IFS : FSMap) {
 // getCallsiteSamples() is iterated through const references, so the
 // constness is cast away to enqueue a mutable pointer.
391 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
392 FSToUpdate.push(&FS);
393 }
394 }
395 }
396 }
397
399 Module &CurrentModule;
400 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
401};
402
403// Inline candidate used by iterative callsite prioritized inliner
404struct InlineCandidate {
405 CallBase *CallInstr;
406 const FunctionSamples *CalleeSamples;
407 // Prorated callsite count, which will be used to guide inlining. For example,
408 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
409 // copies will get their own distribution factors and their prorated counts
410 // will be used to decide if they should be inlined independently.
411 uint64_t CallsiteCount;
412 // Call site distribution factor to prorate the profile samples for a
413 // duplicated callsite. Default value is 1.0.
414 float CallsiteDistribution;
415};
416
417// Inline candidate comparer using call site weight
418struct CandidateComparer {
419 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
420 if (LHS.CallsiteCount != RHS.CallsiteCount)
421 return LHS.CallsiteCount < RHS.CallsiteCount;
422
423 const FunctionSamples *LCS = LHS.CalleeSamples;
424 const FunctionSamples *RCS = RHS.CalleeSamples;
425 assert(LCS && RCS && "Expect non-null FunctionSamples");
426
427 // Tie breaker using number of samples try to favor smaller functions first
428 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
429 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
430
431 // Tie breaker using GUID so we have stable/deterministic inlining order
432 return LCS->getGUID(LCS->getName()) < RCS->getGUID(RCS->getName());
433 }
434};
435
436using CandidateQueue =
438 CandidateComparer>;
439
440// Sample profile matching - fuzzy match.
441class SampleProfileMatcher {
442 Module &M;
443 SampleProfileReader &Reader;
444 const PseudoProbeManager *ProbeManager;
 // Filled by ProfileConverter::flattenProfile() in the constructor and
 // queried by getFlattenedSamplesFor().
445 SampleProfileMap FlattenedProfiles;
446 // For each function, the matcher generates a map, of which each entry is a
447 // mapping from the source location of current build to the source location in
448 // the profile.
449 StringMap<LocToLocMap> FuncMappings;
450
451 // Profile mismatching statistics.
452 uint64_t TotalProfiledCallsites = 0;
453 uint64_t NumMismatchedCallsites = 0;
454 uint64_t MismatchedCallsiteSamples = 0;
455 uint64_t TotalCallsiteSamples = 0;
456 uint64_t TotalProfiledFunc = 0;
457 uint64_t NumMismatchedFuncHash = 0;
458 uint64_t MismatchedFuncHashSamples = 0;
459 uint64_t TotalFuncHashSamples = 0;
460
461public:
 // Stores the module, reader and probe manager. The flattenProfile() call
 // below fills FlattenedProfiles from the reader's profiles.
 // NOTE(review): the condition guarding that call is not visible here --
 // confirm against the full source.
462 SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
463 const PseudoProbeManager *ProbeManager)
464 : M(M), Reader(Reader), ProbeManager(ProbeManager) {
466 ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles,
468 }
469 }
470 void runOnModule();
471
472private:
 // Looks up the flattened profile for F. Returns a pointer into
 // FlattenedProfiles, or nullptr when there is no entry.
473 FunctionSamples *getFlattenedSamplesFor(const Function &F) {
475 auto It = FlattenedProfiles.find(CanonFName);
476 if (It != FlattenedProfiles.end())
477 return &It->second;
478 return nullptr;
479 }
480 void runOnFunction(const Function &F, const FunctionSamples &FS);
481 void countProfileMismatches(
482 const FunctionSamples &FS,
483 const std::unordered_set<LineLocation, LineLocationHash>
484 &MatchedCallsiteLocs,
485 uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites);
486
 // Returns the location map stored in FuncMappings for F; try_emplace
 // creates the entry on first use.
487 LocToLocMap &getIRToProfileLocationMap(const Function &F) {
488 auto Ret = FuncMappings.try_emplace(
490 return Ret.first->second;
491 }
492 void distributeIRToProfileLocationMap();
493 void distributeIRToProfileLocationMap(FunctionSamples &FS);
494 void populateProfileCallsites(
495 const FunctionSamples &FS,
496 StringMap<std::set<LineLocation>> &CalleeToCallsitesMap);
497 void runStaleProfileMatching(
498 const std::map<LineLocation, StringRef> &IRLocations,
499 StringMap<std::set<LineLocation>> &CalleeToCallsitesMap,
500 LocToLocMap &IRToProfileLocationMap);
501};
502
503/// Sample profile pass.
504///
505/// This pass reads profile data from the file specified by
506/// -sample-profile-file and annotates every affected function with the
507/// profile information found in that file.
508class SampleProfileLoader final
509 : public SampleProfileLoaderBaseImpl<BasicBlock> {
510public:
 /// Stores the three per-function analysis getters and the LTO phase, and
 /// picks the pass name that getAnnotatedRemarkPassName() returns.
511 SampleProfileLoader(
512 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
514 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
515 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
516 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
518 std::move(FS)),
519 GetAC(std::move(GetAssumptionCache)),
520 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
521 LTOPhase(LTOPhase),
522 AnnotatedPassName(AnnotateSampleProfileInlinePhase
525 : CSINLINE_DEBUG) {}
526
527 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
528 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
530
531protected:
533 bool emitAnnotations(Function &F);
535 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
536 const FunctionSamples *
537 findFunctionSamples(const Instruction &I) const override;
538 std::vector<const FunctionSamples *>
539 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
540 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
541 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
542 const StringMap<Function *> &SymbolMap,
543 uint64_t Threshold);
544 // Attempt to promote indirect call and also inline the promoted call
545 bool tryPromoteAndInlineCandidate(
546 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
547 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
548
549 bool inlineHotFunctions(Function &F,
550 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
551 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
552 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
553 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
554 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
555 bool
556 tryInlineCandidate(InlineCandidate &Candidate,
557 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
558 bool
559 inlineHotFunctionsWithPriority(Function &F,
560 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
561 // Inline cold/small functions in addition to hot ones
562 bool shouldInlineColdCallee(CallBase &CallInst);
563 void emitOptimizationRemarksForInlineCandidates(
564 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
565 bool Hot);
566 void promoteMergeNotInlinedContextSamples(
568 const Function &F);
569 std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG);
570 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M);
571 void generateMDProfMetadata(Function &F);
572
573 /// Map from function name to Function *. Used to find the function from
574 /// the function name. If the function name contains suffix, additional
575 /// entry is added to map from the stripped name to the function if there
576 /// is one-to-one mapping.
578
 /// Per-function analysis getters, moved in from the constructor arguments.
579 std::function<AssumptionCache &(Function &)> GetAC;
580 std::function<TargetTransformInfo &(Function &)> GetTTI;
581 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
582
583 /// Profile tracker for different context.
584 std::unique_ptr<SampleContextTracker> ContextTracker;
585
586 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
587 ///
588 /// We need to know the LTO phase because for example in ThinLTOPrelink
589 /// phase, in annotation, we should not promote indirect calls. Instead,
590 /// we will mark GUIDs that needs to be annotated to the function.
591 const ThinOrFullLTOPhase LTOPhase;
 /// Pass name handed out by getAnnotatedRemarkPassName().
592 const std::string AnnotatedPassName;
593
594 /// Profile symbol list tells whether a function name appears in the binary
595 /// used to generate the current profile.
596 std::unique_ptr<ProfileSymbolList> PSL;
597
598 /// Total number of samples collected in this profile.
599 ///
600 /// This is the sum of all the samples collected in all the functions executed
601 /// at runtime.
602 uint64_t TotalCollectedSamples = 0;
603
604 // Information recorded when we declined to inline a call site
605 // because we have determined it is too cold is accumulated for
606 // each callee function. Initially this is just the entry count.
607 struct NotInlinedProfileInfo {
608 uint64_t entryCount;
609 };
611
612 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
613 // all the function symbols defined or declared in current module.
614 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
615
616 // All the Names used in FunctionSamples including outline function
617 // names, inline instance names and call target names.
618 StringSet<> NamesInProfile;
619
620 // For symbol in profile symbol list, whether to regard their profiles
621 // to be accurate. It is mainly decided by existence of profile symbol
622 // list and -profile-accurate-for-symsinlist flag, but it can be
623 // overridden by -profile-sample-accurate or profile-sample-accurate
624 // attribute.
625 bool ProfAccForSymsInList;
626
627 // External inline advisor used to replay inline decision from remarks.
628 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
629
630 // A helper to implement the sample profile matching algorithm.
631 std::unique_ptr<SampleProfileMatcher> MatchingManager;
632
633private:
 // Returns AnnotatedPassName as a C string. The pointer refers to the
 // member's storage, so it is valid only while this object is alive.
634 const char *getAnnotatedRemarkPassName() const {
635 return AnnotatedPassName.c_str();
636 }
637};
638} // end anonymous namespace
639
// Returns the sample weight to use for Inst. A default-constructed
// std::error_code is returned instead when Inst has no debug location or is a
// branch, intrinsic or PHI node. Everything that falls through is delegated
// to getInstWeightImpl().
// NOTE(review): the conditions guarding the getProbeWeight() return and the
// direct-call check are not visible in this excerpt -- confirm against the
// full source.
640ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
642 return getProbeWeight(Inst);
643
644 const DebugLoc &DLoc = Inst.getDebugLoc();
645 if (!DLoc)
646 return std::error_code();
647
648 // Ignore all intrinsics, phinodes and branch instructions.
649 // Branch and phinodes instruction usually contains debug info from sources
650 // outside of the residing basic block, thus we ignore them during annotation.
651 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
652 return std::error_code();
653
654 // For non-CS profile, if a direct call/invoke instruction is inlined in
655 // profile (findCalleeFunctionSamples returns non-empty result), but not
656 // inlined here, it means that the inlined callsite has no sample, thus the
657 // call instruction should have 0 count.
658 // For CS profile, the callsite count of previously inlined callees is
659 // populated with the entry count of the callees.
661 if (const auto *CB = dyn_cast<CallBase>(&Inst))
662 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
663 return 0;
664
665 return getInstWeightImpl(Inst);
666}
667
668/// Get the FunctionSamples for a call instruction.
669///
670/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
671/// instance in which that call instruction is calling to. It contains
672/// all samples that reside in the inlined instance. We first find the
673/// inlined instance in which the call instruction is from, then we
674/// traverse its children to find the callsite with the matching
675/// location.
676///
677/// \param Inst Call/Invoke instruction to query.
678///
679/// \returns The FunctionSamples pointer to the inlined instance, or nullptr
/// when \p Inst has no debug location or no enclosing FunctionSamples is
/// found for it.
680const FunctionSamples *
681SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
682 const DILocation *DIL = Inst.getDebugLoc();
683 if (!DIL) {
684 return nullptr;
685 }
686
 // CalleeName stays empty when getCalledFunction() returns null.
687 StringRef CalleeName;
688 if (Function *Callee = Inst.getCalledFunction())
689 CalleeName = Callee->getName();
690
 // NOTE(review): the condition selecting this ContextTracker path is not
 // visible in this excerpt -- confirm against the full source.
692 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
693
694 const FunctionSamples *FS = findFunctionSamples(Inst);
695 if (FS == nullptr)
696 return nullptr;
697
698 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
699 CalleeName, Reader->getRemapper());
700}
701
702/// Returns a vector of FunctionSamples that are the indirect call targets
703/// of \p Inst. The vector is sorted by descending head-sample estimate, with
/// ties broken by ascending GUID of the function name. Stores
704/// the total call count of the indirect call in \p Sum.
///
/// \p Sum is left untouched on the early-return paths taken before it is
/// reset to 0: when \p Inst has no debug location, when the context tracker
/// returns no callee samples, and when no FunctionSamples is found for
/// \p Inst.
705std::vector<const FunctionSamples *>
706SampleProfileLoader::findIndirectCallFunctionSamples(
707 const Instruction &Inst, uint64_t &Sum) const {
708 const DILocation *DIL = Inst.getDebugLoc();
709 std::vector<const FunctionSamples *> R;
710
711 if (!DIL) {
712 return R;
713 }
714
 // Orders by larger head-sample estimate first, then by smaller GUID.
715 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
716 assert(L && R && "Expect non-null FunctionSamples");
717 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
718 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
719 return FunctionSamples::getGUID(L->getName()) <
720 FunctionSamples::getGUID(R->getName());
721 };
722
 // NOTE(review): the condition opening this ContextTracker branch is not
 // visible in this excerpt -- confirm against the full source.
724 auto CalleeSamples =
725 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
726 if (CalleeSamples.empty())
727 return R;
728
729 // For CSSPGO, we only use target context profile's entry count
730 // as that already includes both inlined callee and non-inlined ones.
731 Sum = 0;
732 for (const auto *const FS : CalleeSamples) {
733 Sum += FS->getHeadSamplesEstimate();
734 R.push_back(FS);
735 }
736 llvm::sort(R, FSCompare);
737 return R;
738 }
739
740 const FunctionSamples *FS = findFunctionSamples(Inst);
741 if (FS == nullptr)
742 return R;
743
 // Sum is the call-target counts recorded at the call site plus the
 // head-sample estimate of every FunctionSamples recorded there.
745 auto T = FS->findCallTargetMapAt(CallSite);
746 Sum = 0;
747 if (T)
748 for (const auto &T_C : T.get())
749 Sum += T_C.second;
750 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
751 if (M->empty())
752 return R;
753 for (const auto &NameFS : *M) {
754 Sum += NameFS.second.getHeadSamplesEstimate();
755 R.push_back(&NameFS.second);
756 }
757 llvm::sort(R, FSCompare);
758 }
759 return R;
760}
761
// Returns the FunctionSamples associated with Inst's debug location. When
// Inst has no debug location, Samples is returned. Lookups are cached per
// DILocation in DILocation2SampleMap, so each location is resolved once.
762const FunctionSamples *
763SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
 // NOTE(review): the condition opening this probe check is not visible in
 // this excerpt -- confirm against the full source.
765 std::optional<PseudoProbe> Probe = extractProbe(Inst);
766 if (!Probe)
767 return nullptr;
768 }
769
770 const DILocation *DIL = Inst.getDebugLoc();
771 if (!DIL)
772 return Samples;
773
 // try_emplace inserts a nullptr placeholder only when DIL is not cached
 // yet; it.second is true in that case and the entry is then filled in.
774 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
775 if (it.second) {
777 it.first->second = ContextTracker->getContextSamplesFor(DIL);
778 else
779 it.first->second =
780 Samples->findFunctionSamples(DIL, Reader->getRemapper());
781 }
782 return it.first->second;
783}
784
785/// Check whether the indirect call promotion history of \p Inst allows
786/// the promotion for \p Candidate.
787/// If the profile count for the promotion candidate \p Candidate is
788/// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
789/// for \p Inst. If we already have at least MaxNumPromotions
790/// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
791/// cannot promote for \p Inst anymore.
792static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
793 uint32_t NumVals = 0;
794 uint64_t TotalCount = 0;
795 std::unique_ptr<InstrProfValueData[]> ValueData =
796 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
797 bool Valid =
798 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
799 ValueData.get(), NumVals, TotalCount, true);
800 // No valid value profile so no promoted targets have been recorded
801 // before. Ok to do ICP.
802 if (!Valid)
803 return true;
804
805 unsigned NumPromoted = 0;
806 for (uint32_t I = 0; I < NumVals; I++) {
807 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
808 continue;
809
810 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
811 // metadata, it means the candidate has been promoted for this
812 // indirect call.
813 if (ValueData[I].Value == Function::getGUID(Candidate))
814 return false;
815 NumPromoted++;
816 // If already have MaxNumPromotions promotion, don't do it anymore.
817 if (NumPromoted == MaxNumPromotions)
818 return false;
819 }
820 return true;
821}
822
823/// Update indirect call target profile metadata for \p Inst.
824/// Usually \p Sum is the sum of counts of all the targets for \p Inst.
825/// If it is 0, it means updateIDTMetaData is used to mark a
826/// certain target to be promoted already. If it is not zero,
827/// we expect to use it to update the total count in the value profile.
828static void
830 const SmallVectorImpl<InstrProfValueData> &CallTargets,
831 uint64_t Sum) {
832 // Bail out early if MaxNumPromotions is zero.
833 // This prevents allocating an array of zero length below.
834 //
835 // Note `updateIDTMetaData` is called in two places so check
836 // `MaxNumPromotions` inside it.
837 if (MaxNumPromotions == 0)
838 return;
839 uint32_t NumVals = 0;
840 // OldSum is the existing total count in the value profile data.
841 uint64_t OldSum = 0;
842 std::unique_ptr<InstrProfValueData[]> ValueData =
843 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
844 bool Valid =
845 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
846 ValueData.get(), NumVals, OldSum, true);
847
 // Maps target value -> count. It is seeded from the existing value profile
 // and then merged with CallTargets.
848 DenseMap<uint64_t, uint64_t> ValueCountMap;
849 if (Sum == 0) {
850 assert((CallTargets.size() == 1 &&
851 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
852 "If sum is 0, assume only one element in CallTargets "
853 "with count being NOMORE_ICP_MAGICNUM");
854 // Initialize ValueCountMap with existing value profile data.
855 if (Valid) {
856 for (uint32_t I = 0; I < NumVals; I++)
857 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
858 }
859 auto Pair =
860 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
861 // If the target already exists in value profile, decrease the total
862 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
863 if (!Pair.second) {
864 OldSum -= Pair.first->second;
865 Pair.first->second = NOMORE_ICP_MAGICNUM;
866 }
 // The adjusted existing total becomes the total that is written back.
867 Sum = OldSum;
868 } else {
869 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
870 // counts in the value profile.
871 if (Valid) {
872 for (uint32_t I = 0; I < NumVals; I++) {
873 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
874 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
875 }
876 }
877
878 for (const auto &Data : CallTargets) {
879 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
 // A successful insertion means the target was not already marked as
 // promoted, so its count is kept as given.
880 if (Pair.second)
881 continue;
882 // The target represented by Data.Value has already been promoted.
883 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
884 // Sum by Data.Count.
885 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
886 Sum -= Data.Count;
887 }
888 }
889
 // Copy the merged map into NewCallTargets so it can be sorted.
891 for (const auto &ValueCount : ValueCountMap) {
892 NewCallTargets.emplace_back(
893 InstrProfValueData{ValueCount.first, ValueCount.second});
894 }
895
 // Sort by descending count, breaking ties by descending value.
896 llvm::sort(NewCallTargets,
897 [](const InstrProfValueData &L, const InstrProfValueData &R) {
898 if (L.Count != R.Count)
899 return L.Count > R.Count;
900 return L.Value > R.Value;
901 });
902
 // At most MaxNumPromotions entries are passed on.
903 uint32_t MaxMDCount =
904 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
906 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
907}
908
909/// Attempt to promote indirect call and also inline the promoted call.
910///
911/// \param F Caller function.
912/// \param Candidate ICP and inline candidate.
913/// \param SumOrigin Original sum of target counts for indirect call before
914/// promoting given candidate.
915/// \param Sum Prorated sum of remaining target counts for indirect call
916/// after promoting given candidate.
917/// \param InlinedCallSite Output vector for new call sites exposed after
918/// inlining.
919bool SampleProfileLoader::tryPromoteAndInlineCandidate(
920 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
921 SmallVector<CallBase *, 8> *InlinedCallSite) {
922 // Bail out early if sample-loader inliner is disabled.
924 return false;
925
926 // Bail out early if MaxNumPromotions is zero.
927 // This prevents allocating an array of zero length in callees below.
928 if (MaxNumPromotions == 0)
929 return false;
930 auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName();
931 auto R = SymbolMap.find(CalleeFunctionName);
932 if (R == SymbolMap.end() || !R->getValue())
933 return false;
934
935 auto &CI = *Candidate.CallInstr;
936 if (!doesHistoryAllowICP(CI, R->getValue()->getName()))
937 return false;
938
939 const char *Reason = "Callee function not available";
940 // R->getValue() != &F is to prevent promoting a recursive call.
941 // If it is a recursive call, we do not inline it as it could bloat
942 // the code exponentially. There is way to better handle this, e.g.
943 // clone the caller first, and inline the cloned caller if it is
944 // recursive. As llvm does not inline recursive calls, we will
945 // simply ignore it instead of handling it explicitly.
946 if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
947 R->getValue()->hasFnAttribute("use-sample-profile") &&
948 R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) {
949 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
950 // in the value profile metadata so the target won't be promoted again.
951 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
952 Function::getGUID(R->getValue()->getName()), NOMORE_ICP_MAGICNUM}};
953 updateIDTMetaData(CI, SortedCallTargets, 0);
954
955 auto *DI = &pgo::promoteIndirectCall(
956 CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE);
957 if (DI) {
958 Sum -= Candidate.CallsiteCount;
959 // Do not prorate the indirect callsite distribution since the original
960 // distribution will be used to scale down non-promoted profile target
961 // counts later. By doing this we lose track of the real callsite count
962 // for the leftover indirect callsite as a trade off for accurate call
963 // target counts.
964 // TODO: Ideally we would have two separate factors, one for call site
965 // counts and one is used to prorate call target counts.
966 // Do not update the promoted direct callsite distribution at this
967 // point since the original distribution combined with the callee profile
968 // will be used to prorate callsites from the callee if inlined. Once not
969 // inlined, the direct callsite distribution should be prorated so that
970 // the it will reflect the real callsite counts.
971 Candidate.CallInstr = DI;
972 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
973 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
974 if (!Inlined) {
975 // Prorate the direct callsite distribution so that it reflects real
976 // callsite counts.
978 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
979 }
980 return Inlined;
981 }
982 }
983 } else {
984 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
985 << Candidate.CalleeSamples->getFuncName() << " because "
986 << Reason << "\n");
987 }
988 return false;
989}
990
991bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
993 return false;
994
996 if (Callee == nullptr)
997 return false;
998
1000 GetAC, GetTLI);
1001
1002 if (Cost.isNever())
1003 return false;
1004
1005 if (Cost.isAlways())
1006 return true;
1007
1008 return Cost.getCost() <= SampleColdCallSiteThreshold;
1009}
1010
1011void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1012 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1013 bool Hot) {
1014 for (auto *I : Candidates) {
1015 Function *CalledFunction = I->getCalledFunction();
1016 if (CalledFunction) {
1017 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1018 "InlineAttempt", I->getDebugLoc(),
1019 I->getParent())
1020 << "previous inlining reattempted for "
1021 << (Hot ? "hotness: '" : "size: '")
1022 << ore::NV("Callee", CalledFunction) << "' into '"
1023 << ore::NV("Caller", &F) << "'");
1024 }
1025 }
1026}
1027
1028void SampleProfileLoader::findExternalInlineCandidate(
1029 CallBase *CB, const FunctionSamples *Samples,
1030 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
1031 const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
1032
1033 // If ExternalInlineAdvisor wants to inline an external function
1034 // make sure it's imported
1035 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1036 // Samples may not exist for replayed function, if so
1037 // just add the direct GUID and move on
1038 if (!Samples) {
1039 InlinedGUIDs.insert(
1041 return;
1042 }
1043 // Otherwise, drop the threshold to import everything that we can
1044 Threshold = 0;
1045 }
1046
1047 assert(Samples && "expect non-null caller profile");
1048
1049 // For AutoFDO profile, retrieve candidate profiles by walking over
1050 // the nested inlinee profiles.
1052 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1053 return;
1054 }
1055
1056 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1057 std::queue<ContextTrieNode *> CalleeList;
1058 CalleeList.push(Caller);
1059 while (!CalleeList.empty()) {
1060 ContextTrieNode *Node = CalleeList.front();
1061 CalleeList.pop();
1062 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1063 // For CSSPGO profile, retrieve candidate profile by walking over the
1064 // trie built for context profile. Note that also take call targets
1065 // even if callee doesn't have a corresponding context profile.
1066 if (!CalleeSample)
1067 continue;
1068
1069 // If pre-inliner decision is used, honor that for importing as well.
1070 bool PreInline =
1073 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1074 continue;
1075
1076 StringRef Name = CalleeSample->getFuncName();
1078 // Add to the import list only when it's defined out of module.
1079 if (!Func || Func->isDeclaration())
1080 InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName()));
1081
1082 // Import hot CallTargets, which may not be available in IR because full
1083 // profile annotation cannot be done until backend compilation in ThinLTO.
1084 for (const auto &BS : CalleeSample->getBodySamples())
1085 for (const auto &TS : BS.second.getCallTargets())
1086 if (TS.getValue() > Threshold) {
1087 StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
1088 const Function *Callee = SymbolMap.lookup(CalleeName);
1089 if (!Callee || Callee->isDeclaration())
1090 InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey()));
1091 }
1092
1093 // Import hot child context profile associted with callees. Note that this
1094 // may have some overlap with the call target loop above, but doing this
1095 // based child context profile again effectively allow us to use the max of
1096 // entry count and call target count to determine importing.
1097 for (auto &Child : Node->getAllChildContext()) {
1098 ContextTrieNode *CalleeNode = &Child.second;
1099 CalleeList.push(CalleeNode);
1100 }
1101 }
1102}
1103
1104/// Iteratively inline hot callsites of a function.
1105///
1106/// Iteratively traverse all callsites of the function \p F, so as to
1107/// find out callsites with corresponding inline instances.
1108///
1109/// For such callsites,
1110/// - If it is hot enough, inline the callsites and adds callsites of the callee
1111/// into the caller. If the call is an indirect call, first promote
1112/// it to direct call. Each indirect call is limited with a single target.
1113///
1114/// - If a callsite is not inlined, merge the its profile to the outline
1115/// version (if --sample-profile-merge-inlinee is true), or scale the
1116/// counters of standalone function based on the profile of inlined
1117/// instances (if --sample-profile-merge-inlinee is false).
1118///
1119/// Later passes may consume the updated profiles.
1120///
1121/// \param F function to perform iterative inlining.
1122/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1123/// inlined in the profiled binary.
1124///
1125/// \returns True if there is any inline happened.
1126bool SampleProfileLoader::inlineHotFunctions(
1127 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1128 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1129 // Profile symbol list is ignored when profile-sample-accurate is on.
1130 assert((!ProfAccForSymsInList ||
1132 !F.hasFnAttribute("profile-sample-accurate"))) &&
1133 "ProfAccForSymsInList should be false when profile-sample-accurate "
1134 "is enabled");
1135
1136 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1137 bool Changed = false;
1138 bool LocalChanged = true;
1139 while (LocalChanged) {
1140 LocalChanged = false;
1142 for (auto &BB : F) {
1143 bool Hot = false;
1144 SmallVector<CallBase *, 10> AllCandidates;
1145 SmallVector<CallBase *, 10> ColdCandidates;
1146 for (auto &I : BB) {
1147 const FunctionSamples *FS = nullptr;
1148 if (auto *CB = dyn_cast<CallBase>(&I)) {
1149 if (!isa<IntrinsicInst>(I)) {
1150 if ((FS = findCalleeFunctionSamples(*CB))) {
1151 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1152 "GUIDToFuncNameMap has to be populated");
1153 AllCandidates.push_back(CB);
1154 if (FS->getHeadSamplesEstimate() > 0 ||
1156 LocalNotInlinedCallSites.insert({CB, FS});
1157 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1158 Hot = true;
1159 else if (shouldInlineColdCallee(*CB))
1160 ColdCandidates.push_back(CB);
1161 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1162 AllCandidates.push_back(CB);
1163 }
1164 }
1165 }
1166 }
1167 if (Hot || ExternalInlineAdvisor) {
1168 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1169 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1170 } else {
1171 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1172 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1173 }
1174 }
1175 for (CallBase *I : CIS) {
1176 Function *CalledFunction = I->getCalledFunction();
1177 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1178 0 /* dummy count */,
1179 1.0 /* dummy distribution factor */};
1180 // Do not inline recursive calls.
1181 if (CalledFunction == &F)
1182 continue;
1183 if (I->isIndirectCall()) {
1184 uint64_t Sum;
1185 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1186 uint64_t SumOrigin = Sum;
1187 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1188 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1189 PSI->getOrCompHotCountThreshold());
1190 continue;
1191 }
1192 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1193 continue;
1194
1195 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1196 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1197 LocalNotInlinedCallSites.erase(I);
1198 LocalChanged = true;
1199 }
1200 }
1201 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1202 !CalledFunction->isDeclaration()) {
1203 if (tryInlineCandidate(Candidate)) {
1204 LocalNotInlinedCallSites.erase(I);
1205 LocalChanged = true;
1206 }
1207 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1208 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1209 InlinedGUIDs, SymbolMap,
1210 PSI->getOrCompHotCountThreshold());
1211 }
1212 }
1213 Changed |= LocalChanged;
1214 }
1215
1216 // For CS profile, profile for not inlined context will be merged when
1217 // base profile is being retrieved.
1219 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1220 return Changed;
1221}
1222
1223bool SampleProfileLoader::tryInlineCandidate(
1224 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1225 // Do not attempt to inline a candidate if
1226 // --disable-sample-loader-inlining is true.
1228 return false;
1229
1230 CallBase &CB = *Candidate.CallInstr;
1231 Function *CalledFunction = CB.getCalledFunction();
1232 assert(CalledFunction && "Expect a callee with definition");
1233 DebugLoc DLoc = CB.getDebugLoc();
1234 BasicBlock *BB = CB.getParent();
1235
1236 InlineCost Cost = shouldInlineCandidate(Candidate);
1237 if (Cost.isNever()) {
1238 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1239 "InlineFail", DLoc, BB)
1240 << "incompatible inlining");
1241 return false;
1242 }
1243
1244 if (!Cost)
1245 return false;
1246
1247 InlineFunctionInfo IFI(GetAC);
1248 IFI.UpdateProfile = false;
1249 InlineResult IR = InlineFunction(CB, IFI,
1250 /*MergeAttributes=*/true);
1251 if (!IR.isSuccess())
1252 return false;
1253
1254 // The call to InlineFunction erases I, so we can't pass it here.
1255 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1256 Cost, true, getAnnotatedRemarkPassName());
1257
1258 // Now populate the list of newly exposed call sites.
1259 if (InlinedCallSites) {
1260 InlinedCallSites->clear();
1261 for (auto &I : IFI.InlinedCallSites)
1262 InlinedCallSites->push_back(I);
1263 }
1264
1266 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1267 ++NumCSInlined;
1268
1269 // Prorate inlined probes for a duplicated inlining callsite which probably
1270 // has a distribution less than 100%. Samples for an inlinee should be
1271 // distributed among the copies of the original callsite based on each
1272 // callsite's distribution factor for counts accuracy. Note that an inlined
1273 // probe may come with its own distribution factor if it has been duplicated
1274 // in the inlinee body. The two factor are multiplied to reflect the
1275 // aggregation of duplication.
1276 if (Candidate.CallsiteDistribution < 1) {
1277 for (auto &I : IFI.InlinedCallSites) {
1278 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1279 setProbeDistributionFactor(*I, Probe->Factor *
1280 Candidate.CallsiteDistribution);
1281 }
1282 NumDuplicatedInlinesite++;
1283 }
1284
1285 return true;
1286}
1287
1288bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1289 CallBase *CB) {
1290 assert(CB && "Expect non-null call instruction");
1291
1292 if (isa<IntrinsicInst>(CB))
1293 return false;
1294
1295 // Find the callee's profile. For indirect call, find hottest target profile.
1296 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1297 // If ExternalInlineAdvisor wants to inline this site, do so even
1298 // if Samples are not present.
1299 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1300 return false;
1301
1302 float Factor = 1.0;
1303 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1304 Factor = Probe->Factor;
1305
1306 uint64_t CallsiteCount =
1307 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1308 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1309 return true;
1310}
1311
1312std::optional<InlineCost>
1313SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1314 std::unique_ptr<InlineAdvice> Advice = nullptr;
1315 if (ExternalInlineAdvisor) {
1316 Advice = ExternalInlineAdvisor->getAdvice(CB);
1317 if (Advice) {
1318 if (!Advice->isInliningRecommended()) {
1319 Advice->recordUnattemptedInlining();
1320 return InlineCost::getNever("not previously inlined");
1321 }
1322 Advice->recordInlining();
1323 return InlineCost::getAlways("previously inlined");
1324 }
1325 }
1326
1327 return {};
1328}
1329
1330bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1331 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1332 return Cost ? !!*Cost : false;
1333}
1334
1336SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1337 if (std::optional<InlineCost> ReplayCost =
1338 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1339 return *ReplayCost;
1340 // Adjust threshold based on call site hotness, only do this for callsite
1341 // prioritized inliner because otherwise cost-benefit check is done earlier.
1342 int SampleThreshold = SampleColdCallSiteThreshold;
1344 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1345 SampleThreshold = SampleHotCallSiteThreshold;
1346 else if (!ProfileSizeInline)
1347 return InlineCost::getNever("cold callsite");
1348 }
1349
1350 Function *Callee = Candidate.CallInstr->getCalledFunction();
1351 assert(Callee && "Expect a definition for inline candidate of direct call");
1352
1353 InlineParams Params = getInlineParams();
1354 // We will ignore the threshold from inline cost, so always get full cost.
1355 Params.ComputeFullInlineCost = true;
1357 // Checks if there is anything in the reachable portion of the callee at
1358 // this callsite that makes this inlining potentially illegal. Need to
1359 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1360 // when cost exceeds threshold without checking all IRs in the callee.
1361 // The acutal cost does not matter because we only checks isNever() to
1362 // see if it is legal to inline the callsite.
1363 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1364 GetTTI(*Callee), GetAC, GetTLI);
1365
1366 // Honor always inline and never inline from call analyzer
1367 if (Cost.isNever() || Cost.isAlways())
1368 return Cost;
1369
1370 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1371 // decisions based on hotness as well as accurate function byte sizes for
1372 // given context using function/inlinee sizes from previous build. It
1373 // stores the decision in profile, and also adjust/merge context profile
1374 // aiming at better context-sensitive post-inline profile quality, assuming
1375 // all inline decision estimates are going to be honored by compiler. Here
1376 // we replay that inline decision under `sample-profile-use-preinliner`.
1377 // Note that we don't need to handle negative decision from preinliner as
1378 // context profile for not inlined calls are merged by preinliner already.
1379 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1380 // Once two node are merged due to promotion, we're losing some context
1381 // so the original context-sensitive preinliner decision should be ignored
1382 // for SyntheticContext.
1383 SampleContext &Context = Candidate.CalleeSamples->getContext();
1384 if (!Context.hasState(SyntheticContext) &&
1385 Context.hasAttribute(ContextShouldBeInlined))
1386 return InlineCost::getAlways("preinliner");
1387 }
1388
1389 // For old FDO inliner, we inline the call site as long as cost is not
1390 // "Never". The cost-benefit check is done earlier.
1392 return InlineCost::get(Cost.getCost(), INT_MAX);
1393 }
1394
1395 // Otherwise only use the cost from call analyzer, but overwite threshold with
1396 // Sample PGO threshold.
1397 return InlineCost::get(Cost.getCost(), SampleThreshold);
1398}
1399
1400bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1401 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1402 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1403 // Profile symbol list is ignored when profile-sample-accurate is on.
1404 assert((!ProfAccForSymsInList ||
1406 !F.hasFnAttribute("profile-sample-accurate"))) &&
1407 "ProfAccForSymsInList should be false when profile-sample-accurate "
1408 "is enabled");
1409
1410 // Populating worklist with initial call sites from root inliner, along
1411 // with call site weights.
1412 CandidateQueue CQueue;
1413 InlineCandidate NewCandidate;
1414 for (auto &BB : F) {
1415 for (auto &I : BB) {
1416 auto *CB = dyn_cast<CallBase>(&I);
1417 if (!CB)
1418 continue;
1419 if (getInlineCandidate(&NewCandidate, CB))
1420 CQueue.push(NewCandidate);
1421 }
1422 }
1423
1424 // Cap the size growth from profile guided inlining. This is needed even
1425 // though cost of each inline candidate already accounts for callee size,
1426 // because with top-down inlining, we can grow inliner size significantly
1427 // with large number of smaller inlinees each pass the cost check.
1429 "Max inline size limit should not be smaller than min inline size "
1430 "limit.");
1431 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1432 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1433 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1434 if (ExternalInlineAdvisor)
1435 SizeLimit = std::numeric_limits<unsigned>::max();
1436
1437 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1438
1439 // Perform iterative BFS call site prioritized inlining
1440 bool Changed = false;
1441 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1442 InlineCandidate Candidate = CQueue.top();
1443 CQueue.pop();
1444 CallBase *I = Candidate.CallInstr;
1445 Function *CalledFunction = I->getCalledFunction();
1446
1447 if (CalledFunction == &F)
1448 continue;
1449 if (I->isIndirectCall()) {
1450 uint64_t Sum = 0;
1451 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
1452 uint64_t SumOrigin = Sum;
1453 Sum *= Candidate.CallsiteDistribution;
1454 unsigned ICPCount = 0;
1455 for (const auto *FS : CalleeSamples) {
1456 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1457 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1458 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1459 PSI->getOrCompHotCountThreshold());
1460 continue;
1461 }
1462 uint64_t EntryCountDistributed =
1463 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1464 // In addition to regular inline cost check, we also need to make sure
1465 // ICP isn't introducing excessive speculative checks even if individual
1466 // target looks beneficial to promote and inline. That means we should
1467 // only do ICP when there's a small number dominant targets.
1468 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1469 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1470 break;
1471 // TODO: Fix CallAnalyzer to handle all indirect calls.
1472 // For indirect call, we don't run CallAnalyzer to get InlineCost
1473 // before actual inlining. This is because we could see two different
1474 // types from the same definition, which makes CallAnalyzer choke as
1475 // it's expecting matching parameter type on both caller and callee
1476 // side. See example from PR18962 for the triggering cases (the bug was
1477 // fixed, but we generate different types).
1478 if (!PSI->isHotCount(EntryCountDistributed))
1479 break;
1480 SmallVector<CallBase *, 8> InlinedCallSites;
1481 // Attach function profile for promoted indirect callee, and update
1482 // call site count for the promoted inline candidate too.
1483 Candidate = {I, FS, EntryCountDistributed,
1484 Candidate.CallsiteDistribution};
1485 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1486 &InlinedCallSites)) {
1487 for (auto *CB : InlinedCallSites) {
1488 if (getInlineCandidate(&NewCandidate, CB))
1489 CQueue.emplace(NewCandidate);
1490 }
1491 ICPCount++;
1492 Changed = true;
1493 } else if (!ContextTracker) {
1494 LocalNotInlinedCallSites.insert({I, FS});
1495 }
1496 }
1497 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1498 !CalledFunction->isDeclaration()) {
1499 SmallVector<CallBase *, 8> InlinedCallSites;
1500 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1501 for (auto *CB : InlinedCallSites) {
1502 if (getInlineCandidate(&NewCandidate, CB))
1503 CQueue.emplace(NewCandidate);
1504 }
1505 Changed = true;
1506 } else if (!ContextTracker) {
1507 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1508 }
1509 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1510 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1511 InlinedGUIDs, SymbolMap,
1512 PSI->getOrCompHotCountThreshold());
1513 }
1514 }
1515
1516 if (!CQueue.empty()) {
1517 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1518 ++NumCSInlinedHitMaxLimit;
1519 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1520 ++NumCSInlinedHitMinLimit;
1521 else
1522 ++NumCSInlinedHitGrowthLimit;
1523 }
1524
1525 // For CS profile, profile for not inlined context will be merged when
1526 // base profile is being retrieved.
1528 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1529 return Changed;
1530}
1531
1532void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1534 const Function &F) {
1535 // Accumulate not inlined callsite information into notInlinedSamples
1536 for (const auto &Pair : NonInlinedCallSites) {
1537 CallBase *I = Pair.first;
1538 Function *Callee = I->getCalledFunction();
1539 if (!Callee || Callee->isDeclaration())
1540 continue;
1541
1542 ORE->emit(
1543 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1544 I->getDebugLoc(), I->getParent())
1545 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1546 << "' into '" << ore::NV("Caller", &F) << "'");
1547
1548 ++NumCSNotInlined;
1549 const FunctionSamples *FS = Pair.second;
1550 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1551 continue;
1552 }
1553
1554 // Do not merge a context that is already duplicated into the base profile.
1555 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1556 continue;
1557
1558 if (ProfileMergeInlinee) {
1559 // A function call can be replicated by optimizations like callsite
1560 // splitting or jump threading and the replicates end up sharing the
1561 // sample nested callee profile instead of slicing the original
1562 // inlinee's profile. We want to do merge exactly once by filtering out
1563 // callee profiles with a non-zero head sample count.
1564 if (FS->getHeadSamples() == 0) {
1565 // Use entry samples as head samples during the merge, as inlinees
1566 // don't have head samples.
1567 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1568 FS->getHeadSamplesEstimate());
1569
1570 // Note that we have to do the merge right after processing function.
1571 // This allows OutlineFS's profile to be used for annotation during
1572 // top-down processing of functions' annotation.
1573 FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
1574 OutlineFS->merge(*FS, 1);
1575 // Set outlined profile to be synthetic to not bias the inliner.
1576 OutlineFS->SetContextSynthetic();
1577 }
1578 } else {
1579 auto pair =
1580 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1581 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1582 }
1583 }
1584}
1585
1586/// Returns the sorted CallTargetMap \p M by count in descending order.
1590 for (const auto &I : SampleRecord::SortCallTargets(M)) {
1591 R.emplace_back(
1592 InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
1593 }
1594 return R;
1595}
1596
1597// Generate MD_prof metadata for every branch instruction using the
1598// edge weights computed during propagation.
1599void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1600 // Generate MD_prof metadata for every branch instruction using the
1601 // edge weights computed during propagation.
1602 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1603 LLVMContext &Ctx = F.getContext();
1604 MDBuilder MDB(Ctx);
1605 for (auto &BI : F) {
1606 BasicBlock *BB = &BI;
1607
1608 if (BlockWeights[BB]) {
1609 for (auto &I : *BB) {
1610 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1611 continue;
1612 if (!cast<CallBase>(I).getCalledFunction()) {
1613 const DebugLoc &DLoc = I.getDebugLoc();
1614 if (!DLoc)
1615 continue;
1616 const DILocation *DIL = DLoc;
1617 const FunctionSamples *FS = findFunctionSamples(I);
1618 if (!FS)
1619 continue;
1621 auto T = FS->findCallTargetMapAt(CallSite);
1622 if (!T || T.get().empty())
1623 continue;
1625 // Prorate the callsite counts based on the pre-ICP distribution
1626 // factor to reflect what is already done to the callsite before
1627 // ICP, such as calliste cloning.
1628 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1629 if (Probe->Factor < 1)
1630 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1631 }
1632 }
1633 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1635 uint64_t Sum = 0;
1636 for (const auto &C : T.get())
1637 Sum += C.second;
1638 // With CSSPGO all indirect call targets are counted torwards the
1639 // original indirect call site in the profile, including both
1640 // inlined and non-inlined targets.
1642 if (const FunctionSamplesMap *M =
1643 FS->findFunctionSamplesMapAt(CallSite)) {
1644 for (const auto &NameFS : *M)
1645 Sum += NameFS.second.getHeadSamplesEstimate();
1646 }
1647 }
1648 if (Sum)
1649 updateIDTMetaData(I, SortedCallTargets, Sum);
1650 else if (OverwriteExistingWeights)
1651 I.setMetadata(LLVMContext::MD_prof, nullptr);
1652 } else if (!isa<IntrinsicInst>(&I)) {
1653 I.setMetadata(LLVMContext::MD_prof,
1654 MDB.createBranchWeights(
1655 {static_cast<uint32_t>(BlockWeights[BB])}));
1656 }
1657 }
1659 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1660 // clear it for cold code.
1661 for (auto &I : *BB) {
1662 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1663 if (cast<CallBase>(I).isIndirectCall())
1664 I.setMetadata(LLVMContext::MD_prof, nullptr);
1665 else
1666 I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0));
1667 }
1668 }
1669 }
1670
1671 Instruction *TI = BB->getTerminator();
1672 if (TI->getNumSuccessors() == 1)
1673 continue;
1674 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1675 !isa<IndirectBrInst>(TI))
1676 continue;
1677
1678 DebugLoc BranchLoc = TI->getDebugLoc();
1679 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1680 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1681 : Twine("<UNKNOWN LOCATION>"))
1682 << ".\n");
1684 uint32_t MaxWeight = 0;
1685 Instruction *MaxDestInst;
1686 // Since profi treats multiple edges (multiway branches) as a single edge,
1687 // we need to distribute the computed weight among the branches. We do
1688 // this by evenly splitting the edge weight among destinations.
1690 std::vector<uint64_t> EdgeIndex;
1692 EdgeIndex.resize(TI->getNumSuccessors());
1693 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1694 const BasicBlock *Succ = TI->getSuccessor(I);
1695 EdgeIndex[I] = EdgeMultiplicity[Succ];
1696 EdgeMultiplicity[Succ]++;
1697 }
1698 }
1699 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1700 BasicBlock *Succ = TI->getSuccessor(I);
1701 Edge E = std::make_pair(BB, Succ);
1702 uint64_t Weight = EdgeWeights[E];
1703 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1704 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1705 // if needed. Sample counts in profiles are 64-bit unsigned values,
1706 // but internally branch weights are expressed as 32-bit values.
1707 if (Weight > std::numeric_limits<uint32_t>::max()) {
1708 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1709 Weight = std::numeric_limits<uint32_t>::max();
1710 }
1711 if (!SampleProfileUseProfi) {
1712 // Weight is added by one to avoid propagation errors introduced by
1713 // 0 weights.
1714 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1715 } else {
1716 // Profi creates proper weights that do not require "+1" adjustments but
1717 // we evenly split the weight among branches with the same destination.
1718 uint64_t W = Weight / EdgeMultiplicity[Succ];
1719 // Rounding up, if needed, so that first branches are hotter.
1720 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1721 W++;
1722 Weights.push_back(static_cast<uint32_t>(W));
1723 }
1724 if (Weight != 0) {
1725 if (Weight > MaxWeight) {
1726 MaxWeight = Weight;
1727 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1728 }
1729 }
1730 }
1731
1732 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1733
1734 uint64_t TempWeight;
1735 // Only set weights if there is at least one non-zero weight.
1736 // In any other case, let the analyzer set weights.
1737 // Do not set weights if the weights are present unless under
1738 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1739 // twice. If the first annotation already set the weights, the second pass
1740 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1741 // weight should have their existing metadata (possibly annotated by LTO
1742 // prelink) cleared.
1743 if (MaxWeight > 0 &&
1744 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1745 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1746 TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
1747 ORE->emit([&]() {
1748 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1749 << "most popular destination for conditional branches at "
1750 << ore::NV("CondBranchesLoc", BranchLoc);
1751 });
1752 } else {
1754 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1755 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1756 } else {
1757 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1758 }
1759 }
1760 }
1761}
1762
1763/// Once all the branch weights are computed, we emit the MD_prof
1764/// metadata on BB using the computed values for each of its branches.
1765///
1766/// \param F The function to query.
1767///
1768/// \returns true if \p F was modified. Returns false, otherwise.
// Visible flow: validate the profile (or the function's line location), run
// one of the two hot-function inliners, propagate weights with
// computeAndPropagateWeights(), and regenerate MD_prof metadata only when one
// of those steps reported a change. Coverage remarks are emitted on every path
// that reaches the end of the function.
1769bool SampleProfileLoader::emitAnnotations(Function &F) {
1770 bool Changed = false;
1771
 // NOTE(review): listing line 1772 is absent from this extract. The unmatched
 // `} else {` at 1782 shows that it opened the `if` whose first branch
 // validates the profile through ProbeManager.
1773 if (!ProbeManager->profileIsValid(F, *Samples)) {
1774 LLVM_DEBUG(
1775 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1776 << F.getName() << "\n");
1777 ++NumMismatchedProfile;
 // NOTE(review): listing line 1778 is absent from this extract.
1779 return false;
1780 }
1781 ++NumMatchedProfile;
1782 } else {
 // Without a usable function location (getFunctionLoc returns 0) nothing is
 // annotated.
1783 if (getFunctionLoc(F) == 0)
1784 return false;
1785
1786 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1787 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1788 }
1789
1790 DenseSet<GlobalValue::GUID> InlinedGUIDs;
 // NOTE(review): listing line 1791 is absent; the `else` at 1793 implies it
 // held the condition that selects the priority-based inliner.
1792 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1793 else
1794 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1795
1796 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1797
 // Metadata is written only if inlining or weight propagation changed F.
1798 if (Changed)
1799 generateMDProfMetadata(F);
1800
1801 emitCoverageRemarks(F);
1802 return Changed;
1803}
1804
// Build the ProfiledCallGraph for module M. The graph is constructed either
// from *ContextTracker or from Reader->getProfiles(); afterwards every defined
// function carrying the "use-sample-profile" attribute is registered under its
// canonical name. Ownership of the graph is returned to the caller.
1805std::unique_ptr<ProfiledCallGraph>
1806SampleProfileLoader::buildProfiledCallGraph(Module &M) {
1807 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
 // NOTE(review): listing line 1808 is absent from this extract; the `else` at
 // 1810 implies it held the condition selecting the ContextTracker-based graph.
1809 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1810 else
1811 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1812
1813 // Add all functions into the profiled call graph even if they are not in
1814 // the profile. This makes sure functions missing from the profile still
1815 // gets a chance to be processed.
1816 for (Function &F : M) {
1817 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
1818 continue;
1819 ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(F));
1820 }
1821
1822 return ProfiledCG;
1823}
1824
// Compute the order in which the functions of M are processed.
//
// When ProfileTopDownLoad is off, the result is simply module order (and
// ProfileMergeInlinee is switched off as a side effect). Otherwise functions
// are collected either from the SCCs of the profiled call graph or from the
// post-order RefSCCs of the LazyCallGraph CG, and the collected list is
// reversed before being returned. Only defined functions carrying the
// "use-sample-profile" attribute are ever included.
1825std::vector<Function *>
1826SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) {
1827 std::vector<Function *> FunctionOrderList;
1828 FunctionOrderList.reserve(M.size());
1829
 // NOTE(review): listing line 1830 is absent from this extract; it presumably
 // holds the condition guarding the warning below - confirm against upstream.
1831 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1832 "together with -sample-profile-top-down-load.\n";
1833
1834 if (!ProfileTopDownLoad) {
1835 if (ProfileMergeInlinee) {
1836 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1837 // because the profile for a function may be used for the profile
1838 // annotation of its outline copy before the profile merging of its
1839 // non-inlined inline instances, and that is not the way how
1840 // ProfileMergeInlinee is supposed to work.
1841 ProfileMergeInlinee = false;
1842 }
1843
1844 for (Function &F : M)
1845 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1846 FunctionOrderList.push_back(&F);
1847 return FunctionOrderList;
1848 }
1849
 // NOTE(review): listing line 1850 is absent from this extract; it opens the
 // `if (` whose condition is completed on line 1851.
1851 !UseProfiledCallGraph.getNumOccurrences())) {
1852 // Use profiled call edges to augment the top-down order. There are cases
1853 // that the top-down order computed based on the static call graph doesn't
1854 // reflect real execution order. For example
1855 //
1856 // 1. Incomplete static call graph due to unknown indirect call targets.
1857 // Adjusting the order by considering indirect call edges from the
1858 // profile can enable the inlining of indirect call targets by allowing
1859 // the caller processed before them.
1860 // 2. Mutual call edges in an SCC. The static processing order computed for
1861 // an SCC may not reflect the call contexts in the context-sensitive
1862 // profile, thus may cause potential inlining to be overlooked. The
1863 // function order in one SCC is being adjusted to a top-down order based
1864 // on the profile to favor more inlining. This is only a problem with CS
1865 // profile.
1866 // 3. Transitive indirect call edges due to inlining. When a callee function
1867 // (say B) is inlined into a caller function (say A) in LTO prelink,
1868 // every call edge originated from the callee B will be transferred to
1869 // the caller A. If any transferred edge (say A->C) is indirect, the
1870 // original profiled indirect edge B->C, even if considered, would not
1871 // enforce a top-down order from the caller A to the potential indirect
1872 // call target C in LTO postlink since the inlined callee B is gone from
1873 // the static call graph.
1874 // 4. #3 can happen even for direct call targets, due to functions defined
1875 // in header files. A header function (say A), when included into source
1876 // files, is defined multiple times but only one definition survives due
1877 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1878 // definitions can be useless based on a local file scope. More
1879 // importantly, the inlinee (say B), once fully inlined to a
1880 // to-be-dropped A, will have no profile to consume when its outlined
1881 // version is compiled. This can lead to a profile-less prelink
1882 // compilation for the outlined version of B which may be called from
1883 // external modules. while this isn't easy to fix, we rely on the
1884 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1885 // the A can be inlined in its local scope in prelink, it may not exist
1886 // in the merged IR in postlink, and we'll need the profiled call edges
1887 // to enforce a top-down order for the rest of the functions.
1888 //
1889 // Considering those cases, a profiled call graph completely independent of
1890 // the static call graph is constructed based on profile data, where
1891 // function objects are not even needed to handle case #3 and case 4.
1892 //
1893 // Note that static callgraph edges are completely ignored since they
1894 // can be conflicting with profiled edges for cyclic SCCs and may result in
1895 // an SCC order incompatible with profile-defined one. Using strictly
1896 // profile order ensures a maximum inlining experience. On the other hand,
1897 // static call edges are not so important when they don't correspond to a
1898 // context in the profile.
1899
1900 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M);
1901 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1902 while (!CGI.isAtEnd()) {
1903 auto Range = *CGI;
1904 if (SortProfiledSCC) {
1905 // Sort nodes in one SCC based on callsite hotness.
 // NOTE(review): listing line 1906 is absent from this extract; `SI`, used
 // on the next line, is declared there.
1907 Range = *SI;
1908 }
 // Graph nodes are identified by name; nodes that do not resolve through
 // SymbolMap to a defined, profile-enabled function are skipped.
1909 for (auto *Node : Range) {
1910 Function *F = SymbolMap.lookup(Node->Name);
1911 if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
1912 FunctionOrderList.push_back(F);
1913 }
1914 ++CGI;
1915 }
1916 } else {
1917 CG.buildRefSCCs();
1918 for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) {
1919 for (LazyCallGraph::SCC &C : RC) {
1920 for (LazyCallGraph::Node &N : C) {
1921 Function &F = N.getFunction();
1922 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1923 FunctionOrderList.push_back(&F);
1924 }
1925 }
1926 }
1927 }
1928
 // Both branches above fill the list in the opposite of processing order, so
 // flip it once here.
1929 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1930
1931 LLVM_DEBUG({
1932 dbgs() << "Function processing order:\n";
1933 for (auto F : FunctionOrderList) {
1934 dbgs() << F->getName() << "\n";
1935 }
1936 });
1937
1938 return FunctionOrderList;
1939}
1940
// Create the profile reader for Filename, read the profile, and set up the
// loader state derived from it: the profile symbol list, the set of names in
// the profile, the replay inline advisor, option defaults for CS /
// pre-inlined / probe-based profiles, ContextTracker, ProbeManager and
// MatchingManager.
//
// Returns false (after emitting a diagnostic through the LLVMContext) when the
// reader cannot be created, when reading the profile fails, or when a
// probe-based profile is used with a module that is not probed. Returns true
// otherwise.
1941bool SampleProfileLoader::doInitialization(Module &M,
 // NOTE(review): listing line 1942 (the remainder of the parameter list and
 // the opening brace) is absent from this extract. `FAM` used below comes from
 // there - confirm against upstream.
1943 auto &Ctx = M.getContext();
1944
1945 auto ReaderOrErr = SampleProfileReader::create(
1946 Filename, Ctx, *FS, FSDiscriminatorPass::Base, RemappingFilename);
1947 if (std::error_code EC = ReaderOrErr.getError()) {
1948 std::string Msg = "Could not open profile: " + EC.message();
1949 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1950 return false;
1951 }
1952 Reader = std::move(ReaderOrErr.get());
1953 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
1954 // set module before reading the profile so reader may be able to only
1955 // read the function profiles which are used by the current module.
1956 Reader->setModule(&M);
1957 if (std::error_code EC = Reader->read()) {
1958 std::string Msg = "profile reading failed: " + EC.message();
1959 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1960 return false;
1961 }
1962
1963 PSL = Reader->getProfileSymbolList();
1964
1965 // While profile-sample-accurate is on, ignore symbol list.
1966 ProfAccForSymsInList =
 // NOTE(review): listing line 1967 (the right-hand side of the assignment
 // above) is absent from this extract.
1968 if (ProfAccForSymsInList) {
1969 NamesInProfile.clear();
1970 if (auto NameTable = Reader->getNameTable())
1971 NamesInProfile.insert(NameTable->begin(), NameTable->end());
1972 CoverageTracker.setProfAccForSymsInList(true);
1973 }
1974
1975 if (FAM && !ProfileInlineReplayFile.empty()) {
1976 ExternalInlineAdvisor = getReplayInlineAdvisor(
1977 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
 // NOTE(review): listing lines 1978-1981 (further arguments of this call) are
 // absent from this extract.
1982 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
1983 }
1984
 // Each option below is only defaulted when the user did not pass it
 // explicitly (getNumOccurrences() == 0).
1985 // Apply tweaks if context-sensitive or probe-based profile is available.
1986 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
1987 Reader->profileIsProbeBased()) {
1988 if (!UseIterativeBFIInference.getNumOccurrences())
 // NOTE(review): listing line 1989 (the statement controlled by the `if`
 // above) is absent from this extract.
1990 if (!SampleProfileUseProfi.getNumOccurrences())
1991 SampleProfileUseProfi = true;
1992 if (!EnableExtTspBlockPlacement.getNumOccurrences())
 // NOTE(review): listing line 1993 (the statement controlled by the `if`
 // above) is absent from this extract.
1994 // Enable priority-base inliner and size inline by default for CSSPGO.
1995 if (!ProfileSizeInline.getNumOccurrences())
1996 ProfileSizeInline = true;
1997 if (!CallsitePrioritizedInline.getNumOccurrences())
 // NOTE(review): listing line 1998 (the statement controlled by the `if`
 // above) is absent from this extract.
1999 // For CSSPGO, we also allow recursive inline to best use context profile.
2000 if (!AllowRecursiveInline.getNumOccurrences())
2001 AllowRecursiveInline = true;
2002
2003 if (Reader->profileIsPreInlined()) {
2004 if (!UsePreInlinerDecision.getNumOccurrences())
2005 UsePreInlinerDecision = true;
2006 }
2007
2008 if (!Reader->profileIsCS()) {
2009 // Non-CS profile should be fine without a function size budget for the
2010 // inliner since the contexts in the profile are either all from inlining
2011 // in the previous build or pre-computed by the preinliner with a size
2012 // cap, thus they are bounded.
2013 if (!ProfileInlineLimitMin.getNumOccurrences())
2014 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2015 if (!ProfileInlineLimitMax.getNumOccurrences())
2016 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2017 }
2018 }
2019
2020 if (Reader->profileIsCS()) {
2021 // Tracker for profiles under different context
2022 ContextTracker = std::make_unique<SampleContextTracker>(
2023 Reader->getProfiles(), &GUIDToFuncNameMap);
2024 }
2025
2026 // Load pseudo probe descriptors for probe-based function samples.
2027 if (Reader->profileIsProbeBased()) {
2028 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2029 if (!ProbeManager->moduleIsProbed(M)) {
2030 const char *Msg =
2031 "Pseudo-probe-based profile requires SampleProfileProbePass";
2032 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2033 DS_Warning));
2034 return false;
2035 }
2036 }
2037
 // NOTE(review): listing lines 2038-2039 are absent from this extract; the
 // closing brace at 2042 shows they opened the condition under which the
 // matcher is created. ProbeManager.get() may be null here when the profile is
 // not probe-based.
2040 MatchingManager =
2041 std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
2042 }
2043
2044 return true;
2045}
2046
2047void SampleProfileMatcher::countProfileMismatches(
2048 const FunctionSamples &FS,
2049 const std::unordered_set<LineLocation, LineLocationHash>
2050 &MatchedCallsiteLocs,
2051 uint64_t &FuncMismatchedCallsites, uint64_t &FuncProfiledCallsites) {
2052
2053 auto isInvalidLineOffset = [](uint32_t LineOffset) {
2054 return LineOffset & 0x8000;
2055 };
2056
2057 // Check if there are any callsites in the profile that does not match to any
2058 // IR callsites, those callsite samples will be discarded.
2059 for (auto &I : FS.getBodySamples()) {
2060 const LineLocation &Loc = I.first;
2061 if (isInvalidLineOffset(Loc.LineOffset))
2062 continue;
2063
2064 uint64_t Count = I.second.getSamples();
2065 if (!I.second.getCallTargets().empty()) {
2066 TotalCallsiteSamples += Count;
2067 FuncProfiledCallsites++;
2068 if (!MatchedCallsiteLocs.count(Loc)) {
2069 MismatchedCallsiteSamples += Count;
2070 FuncMismatchedCallsites++;
2071 }
2072 }
2073 }
2074
2075 for (auto &I : FS.getCallsiteSamples()) {
2076 const LineLocation &Loc = I.first;
2077 if (isInvalidLineOffset(Loc.LineOffset))
2078 continue;
2079
2080 uint64_t Count = 0;
2081 for (auto &FM : I.second) {
2082 Count += FM.second.getHeadSamplesEstimate();
2083 }
2084 TotalCallsiteSamples += Count;
2085 FuncProfiledCallsites++;
2086 if (!MatchedCallsiteLocs.count(Loc)) {
2087 MismatchedCallsiteSamples += Count;
2088 FuncMismatchedCallsites++;
2089 }
2090 }
2091}
2092
2093// Populate the anchors(direct callee name) from profile.
2094void SampleProfileMatcher::populateProfileCallsites(
2095 const FunctionSamples &FS,
2096 StringMap<std::set<LineLocation>> &CalleeToCallsitesMap) {
2097 for (const auto &I : FS.getBodySamples()) {
2098 const auto &Loc = I.first;
2099 const auto &CTM = I.second.getCallTargets();
2100 // Filter out possible indirect calls, use direct callee name as anchor.
2101 if (CTM.size() == 1) {
2102 StringRef CalleeName = CTM.begin()->first();
2103 const auto &Candidates = CalleeToCallsitesMap.try_emplace(
2104 CalleeName, std::set<LineLocation>());
2105 Candidates.first->second.insert(Loc);
2106 }
2107 }
2108
2109 for (const auto &I : FS.getCallsiteSamples()) {
2110 const LineLocation &Loc = I.first;
2111 const auto &CalleeMap = I.second;
2112 // Filter out possible indirect calls, use direct callee name as anchor.
2113 if (CalleeMap.size() == 1) {
2114 StringRef CalleeName = CalleeMap.begin()->first;
2115 const auto &Candidates = CalleeToCallsitesMap.try_emplace(
2116 CalleeName, std::set<LineLocation>());
2117 Candidates.first->second.insert(Loc);
2118 }
2119 }
2120}
2121
2122// Call target name anchor based profile fuzzy matching.
2123// Input:
2124// For IR locations, the anchor is the callee name of direct callsite; For
2125// profile locations, it's the call target name for BodySamples or inlinee's
2126// profile name for CallsiteSamples.
2127// Matching heuristic:
2128// First match all the anchors in lexical order, then split the non-anchor
2129// locations between the two anchors evenly, first half are matched based on the
2130// start anchor, second half are matched based on the end anchor.
2131// For example, given:
2132// IR locations: [1, 2(foo), 3, 5, 6(bar), 7]
2133// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
2134// The matching gives:
2135// [1, 2(foo), 3, 5, 6(bar), 7]
2136// | | | | | |
2137// [1, 2, 3(foo), 4, 7, 8(bar), 9]
2138// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
2139void SampleProfileMatcher::runStaleProfileMatching(
2140 const std::map<LineLocation, StringRef> &IRLocations,
2141 StringMap<std::set<LineLocation>> &CalleeToCallsitesMap,
2142 LocToLocMap &IRToProfileLocationMap) {
2143 assert(IRToProfileLocationMap.empty() &&
2144 "Run stale profile matching only once per function");
2145
2146 auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
2147 // Skip the unchanged location mapping to save memory.
2148 if (From != To)
2149 IRToProfileLocationMap.insert({From, To});
2150 };
2151
2152 // Use function's beginning location as the initial anchor.
2153 int32_t LocationDelta = 0;
2154 SmallVector<LineLocation> LastMatchedNonAnchors;
2155
2156 for (const auto &IR : IRLocations) {
2157 const auto &Loc = IR.first;
2158 StringRef CalleeName = IR.second;
2159 bool IsMatchedAnchor = false;
2160 // Match the anchor location in lexical order.
2161 if (!CalleeName.empty()) {
2162 auto ProfileAnchors = CalleeToCallsitesMap.find(CalleeName);
2163 if (ProfileAnchors != CalleeToCallsitesMap.end() &&
2164 !ProfileAnchors->second.empty()) {
2165 auto CI = ProfileAnchors->second.begin();
2166 const auto Candidate = *CI;
2167 ProfileAnchors->second.erase(CI);
2168 InsertMatching(Loc, Candidate);
2169 LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
2170 << " is matched from " << Loc << " to " << Candidate
2171 << "\n");
2172 LocationDelta = Candidate.LineOffset - Loc.LineOffset;
2173
2174 // Match backwards for non-anchor locations.
2175 // The locations in LastMatchedNonAnchors have been matched forwards
2176 // based on the previous anchor, spilt it evenly and overwrite the
2177 // second half based on the current anchor.
2178 for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
2179 I < LastMatchedNonAnchors.size(); I++) {
2180 const auto &L = LastMatchedNonAnchors[I];
2181 uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
2182 LineLocation Candidate(CandidateLineOffset, L.Discriminator);
2183 InsertMatching(L, Candidate);
2184 LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
2185 << " to " << Candidate << "\n");
2186 }
2187
2188 IsMatchedAnchor = true;
2189 LastMatchedNonAnchors.clear();
2190 }
2191 }
2192
2193 // Match forwards for non-anchor locations.
2194 if (!IsMatchedAnchor) {
2195 uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta;
2196 LineLocation Candidate(CandidateLineOffset, Loc.Discriminator);
2197 InsertMatching(Loc, Candidate);
2198 LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to "
2199 << Candidate << "\n");
2200 LastMatchedNonAnchors.emplace_back(Loc);
2201 }
2202 }
2203}
2204
// Compare function F against its profile FS.
//
// The function walks every instruction of F to collect (a) the IR locations
// used as matching input and (b) the set of IR callsites whose target agrees
// with the profile. It then updates the mismatch counters and, when the
// function hash mismatched and SalvageStaleProfile is set, runs stale profile
// matching and stores the result in the per-function location map.
2205void SampleProfileMatcher::runOnFunction(const Function &F,
2206 const FunctionSamples &FS) {
2207 bool IsFuncHashMismatch = false;
 // NOTE(review): listing line 2208 is absent from this extract; the closing
 // brace at 2217 shows it opened the condition guarding the hash check below.
2209 uint64_t Count = FS.getTotalSamples();
2210 TotalFuncHashSamples += Count;
2211 TotalProfiledFunc++;
2212 if (!ProbeManager->profileIsValid(F, FS)) {
2213 MismatchedFuncHashSamples += Count;
2214 NumMismatchedFuncHash++;
2215 IsFuncHashMismatch = true;
2216 }
2217 }
2218
2219 std::unordered_set<LineLocation, LineLocationHash> MatchedCallsiteLocs;
2220 // The value of the map is the name of direct callsite and use empty StringRef
2221 // for non-direct-call site.
2222 std::map<LineLocation, StringRef> IRLocations;
2223
2224 // Extract profile matching anchors and profile mismatch metrics in the IR.
2225 for (auto &BB : F) {
2226 for (auto &I : BB) {
2227 // TODO: Support line-number based location(AutoFDO).
 // Pseudo probes are recorded as non-anchor locations (probe id,
 // discriminator 0, empty callee name).
2228 if (FunctionSamples::ProfileIsProbeBased && isa<PseudoProbeInst>(&I)) {
2229 if (std::optional<PseudoProbe> Probe = extractProbe(I))
2230 IRLocations.emplace(LineLocation(Probe->Id, 0), StringRef());
2231 }
2232
 // Only non-intrinsic calls are considered from here on.
2233 if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
2234 continue;
2235
2236 const auto *CB = dyn_cast<CallBase>(&I);
2237 if (auto &DLoc = I.getDebugLoc()) {
 // NOTE(review): listing line 2238 is absent from this extract; `IRCallsite`,
 // used below, is declared there.
2239
 // CalleeName stays empty when the called function cannot be resolved
 // (getCalledFunction() returns null), i.e. the indirect-call case below.
2240 StringRef CalleeName;
2241 if (Function *Callee = CB->getCalledFunction())
2242 CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName());
2243
2244 // Force to overwrite the callee name in case any non-call location was
2245 // written before.
2246 auto R = IRLocations.emplace(IRCallsite, CalleeName);
2247 R.first->second = CalleeName;
 // NOTE(review): listing line 2248 is absent from this extract; it opens the
 // assertion whose remaining operands follow.
2249 R.first->second == CalleeName) &&
2250 "Overwrite non-call or different callee name location for "
2251 "pseudo probe callsite");
2252
2253 // Go through all the callsites on the IR and flag the callsite if the
2254 // target name is the same as the one in the profile.
2255 const auto CTM = FS.findCallTargetMapAt(IRCallsite);
2256 const auto CallsiteFS = FS.findFunctionSamplesMapAt(IRCallsite);
2257
2258 // Indirect call case.
2259 if (CalleeName.empty()) {
2260 // Since indirect call does not have the CalleeName, check
2261 // conservatively if callsite in the profile is a callsite location.
2262 // This is to avoid nums of false positive since otherwise all the
2263 // indirect call samples will be reported as mismatching.
2264 if ((CTM && !CTM->empty()) || (CallsiteFS && !CallsiteFS->empty()))
2265 MatchedCallsiteLocs.insert(IRCallsite);
2266 } else {
2267 // Check if the call target name is matched for direct call case.
2268 if ((CTM && CTM->count(CalleeName)) ||
2269 (CallsiteFS && CallsiteFS->count(CalleeName)))
2270 MatchedCallsiteLocs.insert(IRCallsite);
2271 }
2272 }
2273 }
2274 }
2275
2276 // Detect profile mismatch for profile staleness metrics report.
 // NOTE(review): listing line 2277 is absent from this extract; the closing
 // brace at 2291 shows it opened the condition guarding this report.
2278 uint64_t FuncMismatchedCallsites = 0;
2279 uint64_t FuncProfiledCallsites = 0;
2280 countProfileMismatches(FS, MatchedCallsiteLocs, FuncMismatchedCallsites,
2281 FuncProfiledCallsites);
2282 TotalProfiledCallsites += FuncProfiledCallsites;
2283 NumMismatchedCallsites += FuncMismatchedCallsites;
2284 LLVM_DEBUG({
2285 if (FunctionSamples::ProfileIsProbeBased && !IsFuncHashMismatch &&
2286 FuncMismatchedCallsites)
2287 dbgs() << "Function checksum is matched but there are "
2288 << FuncMismatchedCallsites << "/" << FuncProfiledCallsites
2289 << " mismatched callsites.\n";
2290 });
2291 }
2292
 // IsFuncHashMismatch is only ever set inside the guarded hash check above,
 // so matching runs only for functions that went through that check.
2293 if (IsFuncHashMismatch && SalvageStaleProfile) {
2294 LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
2295 << "\n");
2296
2297 StringMap<std::set<LineLocation>> CalleeToCallsitesMap;
2298 populateProfileCallsites(FS, CalleeToCallsitesMap);
2299
2300 // The matching result will be saved to IRToProfileLocationMap, create a new
2301 // map for each function.
2302 auto &IRToProfileLocationMap = getIRToProfileLocationMap(F);
2303
2304 runStaleProfileMatching(IRLocations, CalleeToCallsitesMap,
2305 IRToProfileLocationMap);
2306 }
2307}
2308
// Run the matcher over every defined function of M that carries the
// "use-sample-profile" attribute and has a profile, then distribute the
// location maps and report the accumulated mismatch statistics, both as text
// on errs() and as an operand of the "llvm.stats" named metadata.
2309void SampleProfileMatcher::runOnModule() {
2310 for (auto &F : M) {
2311 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
2312 continue;
2313 FunctionSamples *FS = nullptr;
 // NOTE(review): listing line 2314 is absent from this extract; the `else` at
 // 2316 implies it held the condition selecting the flattened profile.
2315 FS = getFlattenedSamplesFor(F);
2316 else
2317 FS = Reader.getSamplesFor(F);
 // Functions without a profile are skipped.
2318 if (!FS)
2319 continue;
2320 runOnFunction(F, *FS);
2321 }
 // NOTE(review): listing line 2322 is absent from this extract; it may guard
 // the call below - confirm against upstream.
2323 distributeIRToProfileLocationMap();
2324
 // NOTE(review): listing lines 2325-2326 are absent from this extract; the
 // closing braces at 2332 and 2339 show they opened the two conditions that
 // wrap this textual report.
2327 errs() << "(" << NumMismatchedFuncHash << "/" << TotalProfiledFunc << ")"
2328 << " of functions' profile are invalid and "
2329 << " (" << MismatchedFuncHashSamples << "/" << TotalFuncHashSamples
2330 << ")"
2331 << " of samples are discarded due to function hash mismatch.\n";
2332 }
2333 errs() << "(" << NumMismatchedCallsites << "/" << TotalProfiledCallsites
2334 << ")"
2335 << " of callsites' profile are invalid and "
2336 << "(" << MismatchedCallsiteSamples << "/" << TotalCallsiteSamples
2337 << ")"
2338 << " of samples are discarded due to callsite location mismatch.\n";
2339 }
2340
 // NOTE(review): listing line 2341 is absent from this extract; the closing
 // brace at 2363 shows it opened the condition guarding the metadata report.
2342 LLVMContext &Ctx = M.getContext();
2343 MDBuilder MDB(Ctx);
2344
 // NOTE(review): listing lines 2345-2346 are absent from this extract;
 // `ProfStatsVec` is declared there, together with the condition closed at
 // 2352 that guards the function-hash statistics.
2347 ProfStatsVec.emplace_back("NumMismatchedFuncHash", NumMismatchedFuncHash);
2348 ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
2349 ProfStatsVec.emplace_back("MismatchedFuncHashSamples",
2350 MismatchedFuncHashSamples);
2351 ProfStatsVec.emplace_back("TotalFuncHashSamples", TotalFuncHashSamples);
2352 }
2353
2354 ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
2355 ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
2356 ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
2357 MismatchedCallsiteSamples);
2358 ProfStatsVec.emplace_back("TotalCallsiteSamples", TotalCallsiteSamples);
2359
 // The collected name/value pairs become one operand of !llvm.stats.
2360 auto *MD = MDB.createLLVMStats(ProfStatsVec);
2361 auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
2362 NMD->addOperand(MD);
2363 }
2364}
2365
2366void SampleProfileMatcher::distributeIRToProfileLocationMap(
2367 FunctionSamples &FS) {
2368 const auto ProfileMappings = FuncMappings.find(FS.getName());
2369 if (ProfileMappings != FuncMappings.end()) {
2370 FS.setIRToProfileLocationMap(&(ProfileMappings->second));
2371 }
2372
2373 for (auto &Inlinees : FS.getCallsiteSamples()) {
2374 for (auto FS : Inlinees.second) {
2375 distributeIRToProfileLocationMap(FS.second);
2376 }
2377 }
2378}
2379
2380// Use a central place to distribute the matching results. Outlined and inlined
2381// profile with the function name will be set to the same pointer.
2382void SampleProfileMatcher::distributeIRToProfileLocationMap() {
2383 for (auto &I : Reader.getProfiles()) {
2384 distributeIRToProfileLocationMap(I.second);
2385 }
2386}
2387
// Module-level driver of the sample profile loader.
//
// Installs the profile summary on M if none is present, accumulates the total
// number of collected samples, builds SymbolMap (original, alternative and
// remapped names -> Function), optionally runs the profile matcher, and then
// processes every function in the order given by buildFunctionOrder().
// Returns true if any function was changed.
2388bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2389 ProfileSummaryInfo *_PSI,
2390 LazyCallGraph &CG) {
2391 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2392
2393 PSI = _PSI;
2394 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2395 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
 // NOTE(review): listing line 2396 (the remaining argument of the call above)
 // is absent from this extract.
2397 PSI->refresh();
2398 }
2399 // Compute the total number of samples collected in this profile.
2400 for (const auto &I : Reader->getProfiles())
2401 TotalCollectedSamples += I.second.getTotalSamples();
2402
2403 auto Remapper = Reader->getRemapper();
2404 // Populate the symbol map.
2405 for (const auto &N_F : M.getValueSymbolTable()) {
2406 StringRef OrigName = N_F.getKey();
2407 Function *F = dyn_cast<Function>(N_F.getValue());
 // Symbol-table entries that are not functions, or that have an empty name,
 // are ignored.
2408 if (F == nullptr || OrigName.empty())
2409 continue;
2410 SymbolMap[OrigName] = F;
 // NOTE(review): listing line 2411 is absent from this extract; `NewName`,
 // used below, is declared there.
2412 if (OrigName != NewName && !NewName.empty()) {
2413 auto r = SymbolMap.insert(std::make_pair(NewName, F));
2414 // Failing to insert means there is already an entry in SymbolMap,
2415 // thus there are multiple functions that are mapped to the same
2416 // stripped name. In this case of name conflicting, set the value
2417 // to nullptr to avoid confusion.
2418 if (!r.second)
2419 r.first->second = nullptr;
2420 OrigName = NewName;
2421 }
2422 // Insert the remapped names into SymbolMap.
2423 if (Remapper) {
2424 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2425 if (*MapName != OrigName && !MapName->empty())
2426 SymbolMap.insert(std::make_pair(*MapName, F));
2427 }
2428 }
2429 }
2430 assert(SymbolMap.count(StringRef()) == 0 &&
2431 "No empty StringRef should be added in SymbolMap");
2432
 // NOTE(review): listing lines 2433-2434 are absent from this extract; the
 // closing brace at 2436 shows they opened the condition under which the
 // matcher runs.
2435 MatchingManager->runOnModule();
2436 }
2437
2438 bool retval = false;
 // Per-function state is reset before each function is processed.
2439 for (auto *F : buildFunctionOrder(M, CG)) {
2440 assert(!F->isDeclaration());
2441 clearFunctionData();
2442 retval |= runOnFunction(*F, AM);
2443 }
2444
2445 // Account for cold calls not inlined....
 // NOTE(review): listing line 2446 is absent from this extract; it may guard
 // the loop below - confirm against upstream.
2447 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2448 notInlinedCallInfo)
2449 updateProfileCallee(pair.first, pair.second.entryCount);
2450
2451 return retval;
2452}
2453
// Process a single function: choose its initial entry count, obtain an
// OptimizationRemarkEmitter, look up the function's samples and, when
// non-empty samples exist, annotate the function through emitAnnotations().
//
// Returns the result of emitAnnotations(F), or false when F has no samples.
2454bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2455 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2456 DILocation2SampleMap.clear();
2457 // By default the entry count is initialized to -1, which will be treated
2458 // conservatively by getEntryCount as the same as unknown (None). This is
2459 // to avoid newly added code to be treated as cold. If we have samples
2460 // this will be overwritten in emitAnnotations.
2461 uint64_t initialEntryCount = -1;
2462
2463 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2464 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2465 // initialize all the function entry counts to 0. It means all the
2466 // functions without profile will be regarded as cold.
2467 initialEntryCount = 0;
2468 // profile-sample-accurate is a user assertion which has a higher precedence
2469 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2470 ProfAccForSymsInList = false;
2471 }
2472 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2473
2474 // PSL -- profile symbol list include all the symbols in sampled binary.
2475 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2476 // old functions without samples being cold, without having to worry
2477 // about new and hot functions being mistakenly treated as cold.
2478 if (ProfAccForSymsInList) {
2479 // Initialize the entry count to 0 for functions in the list.
2480 if (PSL->contains(F.getName()))
2481 initialEntryCount = 0;
2482
2483 // Function in the symbol list but without sample will be regarded as
2484 // cold. To minimize the potential negative performance impact it could
2485 // have, we want to be a little conservative here saying if a function
2486 // shows up in the profile, no matter as outline function, inline instance
2487 // or call targets, treat the function as not being cold. This will handle
2488 // the cases such as most callsites of a function are inlined in sampled
2489 // binary but not inlined in current build (because of source code drift,
2490 // imprecise debug information, or the callsites are all cold individually
2491 // but not cold accumulatively...), so the outline function showing up as
2492 // cold in sampled binary will actually not be cold after current build.
 // NOTE(review): listing line 2493 is absent from this extract; `CanonName`,
 // used below, is declared there.
2494 if (NamesInProfile.count(CanonName))
2495 initialEntryCount = -1;
2496 }
2497
2498 // Initialize entry count when the function has no existing entry
2499 // count value.
2500 if (!F.getEntryCount())
2501 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
 // Without an analysis manager a locally owned remark emitter is created and
 // kept alive by OwnedORE until this function returns.
2502 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2503 if (AM) {
2504 auto &FAM =
 // NOTE(review): listing lines 2505 and 2507 are absent from this extract;
 // they hold the start of the FAM initializer and, presumably, the assignment
 // of ORE for this branch - confirm against upstream.
2506 .getManager();
2508 } else {
2509 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2510 ORE = OwnedORE.get();
2511 }
2512
 // NOTE(review): listing line 2513 is absent from this extract; the `else` at
 // 2515 implies it held the condition selecting the ContextTracker lookup.
2514 Samples = ContextTracker->getBaseSamplesFor(F);
2515 else
2516 Samples = Reader->getSamplesFor(F);
2517
2518 if (Samples && !Samples->empty())
2519 return emitAnnotations(F);
2520 return false;
2521}
2523 std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase,
2525 : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
2526 LTOPhase(LTOPhase), FS(std::move(FS)) {}
2527
2532
2533 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2535 };
2536 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2538 };
2539 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2541 };
2542
2543 if (!FS)
2545
2546 SampleProfileLoader SampleLoader(
2547 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2548 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2549 : ProfileRemappingFileName,
2550 LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI);
2551
2552 if (!SampleLoader.doInitialization(M, &FAM))
2553 return PreservedAnalyses::all();
2554
2557 if (!SampleLoader.runOnModule(M, &AM, PSI, CG))
2558 return PreservedAnalyses::all();
2559
2560 return PreservedAnalyses::none();
2561}
This file defines the StringMap class.
amdgpu Simplify well known AMD library false FunctionCallee Callee
BlockVerifier::State From
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:678
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
static bool runOnFunction(Function &F, bool PostInlining)
Provides ErrorOr<T> smart pointer.
static cl::opt< unsigned > SizeLimit("eif-limit", cl::init(6), cl::Hidden, cl::desc("Size limit in Hexagon early if-conversion"))
LVReader * CurrentReader
Definition: LVReader.cpp:153
Implements a lazy call graph analysis and related passes for the new pass manager.
Legalize the Machine IR a function's Machine IR
Definition: Legalizer.cpp:81
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PriorityQueue class.
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for context-sensitive profile tracker used by CSSPGO.
This file provides the interface for the sampled PGO profile loader base implementation.
This file provides the utility functions for the sampled PGO loader base implementation.
This file provides the interface for the pseudo probe implementation for AutoFDO.
static cl::opt< std::string > SampleProfileFile("sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > FlattenProfileForMatching("flatten-profile-for-matching", cl::Hidden, cl::init(true), cl::desc("Use flattened profile for stale profile detection and matching."))
static cl::opt< bool > ProfileSampleBlockAccurate("profile-sample-block-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "branches and calls as having 0 samples. Otherwise, treat " "them conservatively as unknown. "))
static cl::opt< unsigned > MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden, cl::desc("Max number of promotions for a single indirect " "call callsite in sample profile loader"))
static cl::opt< ReplayInlinerSettings::Fallback > ProfileInlineReplayFallback("sample-profile-inline-replay-fallback", cl::init(ReplayInlinerSettings::Fallback::Original), cl::values(clEnumValN(ReplayInlinerSettings::Fallback::Original, "Original", "All decisions not in replay send to original advisor (default)"), clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, "AlwaysInline", "All decisions not in replay are inlined"), clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline", "All decisions not in replay are not inlined")), cl::desc("How sample profile inline replay treats sites that don't come " "from the replay. Original: defers to original advisor, " "AlwaysInline: inline all sites not in replay, NeverInline: " "inline no sites not in replay"), cl::Hidden)
static cl::opt< bool > OverwriteExistingWeights("overwrite-existing-weights", cl::Hidden, cl::init(false), cl::desc("Ignore existing branch weights on IR and always overwrite."))
static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl< InstrProfValueData > &CallTargets, uint64_t Sum)
Update indirect call target profile metadata for Inst.
static cl::opt< bool > AnnotateSampleProfileInlinePhase("annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false), cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " "sample-profile inline pass name."))
static cl::opt< std::string > ProfileInlineReplayFile("sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"), cl::desc("Optimization remarks file containing inline remarks to be replayed " "by inlining from sample profile loader."), cl::Hidden)
static cl::opt< bool > ProfileMergeInlinee("sample-profile-merge-inlinee", cl::Hidden, cl::init(true), cl::desc("Merge past inlinee's profile to outline version if sample " "profile loader decided not to inline a call site. It will " "only be enabled when top-down order of profile loading is " "enabled. "))
static cl::opt< bool > PersistProfileStaleness("persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section)."))
static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate)
Check whether the indirect call promotion history of Inst allows the promotion for Candidate.
static SmallVector< InstrProfValueData, 2 > GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M)
Returns the sorted CallTargetMap M by count in descending order.
#define CSINLINE_DEBUG
static cl::opt< bool > UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden, cl::desc("Process functions in a top-down order " "defined by the profiled call graph when " "-sample-profile-top-down-load is on."))
static cl::opt< ReplayInlinerSettings::Scope > ProfileInlineReplayScope("sample-profile-inline-replay-scope", cl::init(ReplayInlinerSettings::Scope::Function), cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function", "Replay on functions that have remarks associated " "with them (default)"), clEnumValN(ReplayInlinerSettings::Scope::Module, "Module", "Replay on the entire module")), cl::desc("Whether inline replay should be applied to the entire " "Module or just the Functions (default) that are present as " "callers in remarks during sample profile inlining."), cl::Hidden)
static cl::opt< unsigned > ProfileICPRelativeHotness("sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25), cl::desc("Relative hotness percentage threshold for indirect " "call promotion in proirity-based sample profile loader inlining."))
Function::ProfileCount ProfileCount
static cl::opt< unsigned > ProfileICPRelativeHotnessSkip("sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1), cl::desc("Skip relative hotness check for ICP up to given number of targets."))
static cl::opt< bool > ReportProfileStaleness("report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics."))
static cl::opt< bool > UsePreInlinerDecision("sample-profile-use-preinliner", cl::Hidden, cl::desc("Use the preinliner decisions stored in profile context."))
static cl::opt< bool > ProfileAccurateForSymsInList("profile-accurate-for-symsinlist", cl::Hidden, cl::init(true), cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. "))
#define DEBUG_TYPE
static cl::opt< bool > DisableSampleLoaderInlining("disable-sample-loader-inlining", cl::Hidden, cl::init(false), cl::desc("If true, artifically skip inline transformation in sample-loader " "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee)."))
static cl::opt< bool > ProfileSizeInline("sample-profile-inline-size", cl::Hidden, cl::init(false), cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size."))
static cl::opt< bool > SalvageStaleProfile("salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the remapped " "location for sample profile query."))
static cl::opt< bool > ProfileTopDownLoad("sample-profile-top-down-load", cl::Hidden, cl::init(true), cl::desc("Do profile annotation and inlining for functions in top-down " "order of call graph during sample profile loading. It only " "works for new pass manager. "))
static cl::opt< bool > ProfileSampleAccurate("profile-sample-accurate", cl::Hidden, cl::init(false), cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. "))
static cl::opt< bool > AllowRecursiveInline("sample-profile-recursive-inline", cl::Hidden, cl::desc("Allow sample loader inliner to inline recursive calls."))
static cl::opt< CallSiteFormat::Format > ProfileInlineReplayFormat("sample-profile-inline-replay-format", cl::init(CallSiteFormat::Format::LineColumnDiscriminator), cl::values(clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"), clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn", "<Line Number>:<Column Number>"), clEnumValN(CallSiteFormat::Format::LineDiscriminator, "LineDiscriminator", "<Line Number>.<Discriminator>"), clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, "LineColumnDiscriminator", "<Line Number>:<Column Number>.<Discriminator> (default)")), cl::desc("How sample profile inline replay file is formatted"), cl::Hidden)
static cl::opt< std::string > SampleProfileRemappingFile("sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden)
static cl::opt< bool > CallsitePrioritizedInline("sample-profile-prioritized-inline", cl::Hidden, cl::desc("Use call site prioritized inlining for sample profile loader." "Currently only CSSPGO is supported."))
This file provides the interface for the sampled PGO loader pass.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1190
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1412
This class represents a function call, abstracting a target machine's calling convention.
Debug location.
A debug info location.
Definition: DebugLoc.h:33
unsigned getLine() const
Definition: DebugLoc.cpp:24
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Diagnostic information for the sample profiler.
Represents either an error or a value T.
Definition: ErrorOr.h:56
Class to represent profile counts.
Definition: Function.h:252
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1725
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:273
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Represents the cost of inlining a function.
Definition: InlineCost.h:89
static InlineCost getNever(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:130
static InlineCost getAlways(const char *Reason, std::optional< CostBenefitPair > CostBenefit=std::nullopt)
Definition: InlineCost.h:125
static InlineCost get(int Cost, int Threshold, int StaticBonus=0)
Definition: InlineCost.h:119
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:203
InlineResult is basically true or false.
Definition: InlineCost.h:179
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:933
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:365
bool extractProfTotalWeight(uint64_t &TotalVal) const
Retrieve total raw weight values of a branch.
Definition: Metadata.cpp:1639
const BasicBlock * getParent() const
Definition: Instruction.h:90
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1521
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An analysis pass which computes the call graph for a module.
A node in the call graph.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
VectorType::iterator erase(typename VectorType::iterator Iterator)
Remove the element given by Iterator.
Definition: MapVector.h:173
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:118
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:111
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Diagnostic information for optimization analysis remarks.
Diagnostic information for applied optimization remarks.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Metadata * getMD(LLVMContext &Context, bool AddPartialField=true, bool AddPartialProfileRatioField=true)
Return summary information as metadata.
bool moduleIsProbed(const Module &M) const
bool profileIsValid(const Function &F, const FunctionSamples &Samples) const
virtual ErrorOr< uint64_t > getInstWeight(const InstructionT &Inst)
Get the weight for an instruction.
virtual const FunctionSamples * findFunctionSamples(const InstructionT &I) const
Get the FunctionSamples for an instruction.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
SampleProfileLoaderPass(std::string File="", std::string RemappingFile="", ThinOrFullLTOPhase LTOPhase=ThinOrFullLTOPhase::None, IntrusiveRefCntPtr< vfs::FileSystem > FS=nullptr)
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:111
iterator end()
Definition: StringMap.h:204
iterator find(StringRef Key)
Definition: StringMap.h:217
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition: StringMap.h:340
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
iterator begin() const
Definition: StringRef.h:111
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
Representation of the samples collected for a function.
Definition: SampleProf.h:734
static uint64_t getGUID(StringRef Name)
Definition: SampleProf.h:1182
void findInlinedFunctions(DenseSet< GlobalValue::GUID > &S, const StringMap< Function * > &SymbolMap, uint64_t Threshold) const
Recursively traverses all children, if the total sample count of the corresponding function is no les...
Definition: SampleProf.h:1020
static StringRef getCanonicalFnName(const Function &F)
Return the canonical name for a function, taking into account suffix elision policy attributes.
Definition: SampleProf.h:1066
StringRef getFuncName() const
Return the original function name.
Definition: SampleProf.h:1053
SampleContext & getContext() const
Definition: SampleProf.h:1162
sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight=1)
Merge the samples in Other into this one.
Definition: SampleProf.h:980
static LineLocation getCallSiteIdentifier(const DILocation *DIL, bool ProfileIsFS=false)
Returns a unique call site identifier for a given debug location of a call instruction.
Definition: SampleProf.cpp:221
uint64_t getHeadSamplesEstimate() const
Return an estimate of the sample count of the function entry basic block.
Definition: SampleProf.h:929
StringRef getName() const
Return the function name.
Definition: SampleProf.h:1050
const BodySampleMap & getBodySamples() const
Return all the samples collected in the body of the function.
Definition: SampleProf.h:953
static bool UseMD5
Whether the profile uses MD5 to represent string.
Definition: SampleProf.h:1167
static void flattenProfile(SampleProfileMap &ProfileMap, bool ProfileIsCS=false)
Definition: SampleProf.h:1355
bool hasAttribute(ContextAttributeMask A)
Definition: SampleProf.h:602
Sample-based profile reader.
SampleProfileMap & getProfiles()
Return all the profiles.
bool profileIsProbeBased() const
Whether input profile is based on pseudo probes.
FunctionSamples * getSamplesFor(const Function &F)
Return the samples collected for function F.
FunctionSamples * getOrCreateSamplesFor(const Function &F)
Return the samples collected for function F, create empty FunctionSamples if it doesn't exist.
bool profileIsPreInlined() const
Whether input profile contains ShouldBeInlined contexts.
std::error_code read()
The interface to read sample profiles from the associated file.
SampleProfileReaderItaniumRemapper * getRemapper()
virtual std::vector< StringRef > * getNameTable()
It includes all the names that have samples either in outline instance or inline instance.
ProfileSummary & getSummary() const
Return the profile summary.
bool profileIsCS() const
Whether input profile is fully context-sensitive.
virtual void setSkipFlatProf(bool Skip)
Don't read profile without context if the flag is set.
static ErrorOr< std::unique_ptr< SampleProfileReader > > create(const std::string Filename, LLVMContext &C, vfs::FileSystem &FS, FSDiscriminatorPass P=FSDiscriminatorPass::Base, const std::string RemapFilename="")
Create a sample profile reader appropriate to the file format.
virtual std::unique_ptr< ProfileSymbolList > getProfileSymbolList()
static const SortedCallTargetSet SortCallTargets(const CallTargetMap &Targets)
Sort call targets in descending order of call frequency.
Definition: SampleProf.h:412
static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets, float DistributionFactor)
Prorate call targets by a distribution factor.
Definition: SampleProf.h:421
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Definition: SCCIterator.h:49
bool isAtEnd() const
Direct loop termination test which is more efficient than comparison with end().
Definition: SCCIterator.h:113
Sort the nodes of a directed SCC in the decreasing order of the edge weights.
Definition: SCCIterator.h:253
const CustomOperand< const MCSubtargetInfo & > Msg[]
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ FS
Definition: X86.h:208
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:703
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
void checkExpectAnnotations(Instruction &I, const ArrayRef< uint32_t > ExistingWeights, bool IsFrontend)
checkExpectAnnotations - compares PGO counters to the thresholds used for llvm.expect and warns if th...
Definition: MisExpect.cpp:202
DenseMap< SymbolStringPtr, ExecutorSymbolDef > SymbolMap
A map from symbol names (as SymbolStringPtrs) to JITSymbols (address/flags pairs).
Definition: Core.h:120
DiagnosticInfoOptimizationBase::Argument NV
CallBase & promoteIndirectCall(CallBase &CB, Function *F, uint64_t Count, uint64_t TotalCount, bool AttachProfToDirectCall, OptimizationRemarkEmitter *ORE)
std::unordered_map< SampleContext, FunctionSamples, SampleContext::Hash > SampleProfileMap
Definition: SampleProf.h:1271
std::unordered_map< LineLocation, LineLocation, LineLocationHash > LocToLocMap
Definition: SampleProf.h:727
std::map< std::string, FunctionSamples, std::less<> > FunctionSamplesMap
Definition: SampleProf.h:724
bool callsiteIsHot(const FunctionSamples *CallsiteFS, ProfileSummaryInfo *PSI, bool ProfAccForSymsInList)
Return true if the given callsite is hot with respect to the hot cutoff threshold.
IntrusiveRefCntPtr< FileSystem > getRealFileSystem()
Gets a vfs::FileSystem for the 'real' file system, as seen by the operating system.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
static bool isIndirectCall(const MachineInstr &MI)
bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], uint32_t &ActualNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst which is annotated with value profile meta data.
Definition: InstrProf.cpp:1066
bool isLegalToPromote(const CallBase &CB, Function *Callee, const char **FailureReason=nullptr)
Return true if the given indirect call site can be made to call Callee.
cl::opt< int > ProfileInlineLimitMin("sample-profile-inline-limit-min", cl::Hidden, cl::init(100), cl::desc("The lower bound of size growth limit for " "proirity-based sample profile loader inlining."))
cl::opt< int > ProfileInlineGrowthLimit("sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), cl::desc("The size growth ratio limit for proirity-based sample profile " "loader inlining."))
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
Definition: SCCIterator.h:233
void setProbeDistributionFactor(Instruction &Inst, float Factor)
Definition: PseudoProbe.cpp:78
std::string AnnotateInlinePassName(InlineContext IC)
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
cl::opt< bool > SampleProfileUseProfi
void annotateValueSite(Module &M, Instruction &Inst, const InstrProfRecord &InstrProfR, InstrProfValueKind ValueKind, uint32_t SiteIndx, uint32_t MaxMDCount=3)
Get the value profile data for value site SiteIdx from InstrProfR and annotate the instruction Inst w...
Definition: InstrProf.cpp:1021
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1744
llvm::cl::opt< bool > UseIterativeBFIInference
std::optional< PseudoProbe > extractProbe(const Instruction &Inst)
Definition: PseudoProbe.cpp:56
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void emitInlinedIntoBasedOnCost(OptimizationRemarkEmitter &ORE, DebugLoc DLoc, const BasicBlock *Block, const Function &Callee, const Function &Caller, const InlineCost &IC, bool ForProfileContext=false, const char *PassName=nullptr)
Emit ORE message based on cost (default heuristic).
cl::opt< bool > SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, cl::desc("Sort profiled recursion by edge weights."))
std::unique_ptr< InlineAdvisor > getReplayInlineAdvisor(Module &M, FunctionAnalysisManager &FAM, LLVMContext &Context, std::unique_ptr< InlineAdvisor > OriginalAdvisor, const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks, InlineContext IC)
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
cl::opt< int > ProfileInlineLimitMax("sample-profile-inline-limit-max", cl::Hidden, cl::init(10000), cl::desc("The upper bound of size growth limit for " "proirity-based sample profile loader inlining."))
cl::opt< int > SampleHotCallSiteThreshold("sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000), cl::desc("Hot callsite threshold for proirity-based sample profile loader " "inlining."))
void updateProfileCallee(Function *Callee, int64_t EntryDelta, const ValueMap< const Value *, WeakTrackingVH > *VMap=nullptr)
Updates profile information by adjusting the entry count by adding EntryDelta then scaling callsite i...
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1946
@ DS_Warning
cl::opt< bool > EnableExtTspBlockPlacement
const uint64_t NOMORE_ICP_MAGICNUM
Magic number in the value profile metadata showing a target has been promoted for the instruction and...
Definition: Metadata.h:56
cl::opt< int > SampleColdCallSiteThreshold("sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites"))
Definition: BitVector.h:858
#define N
Used in the streaming interface as the general argument type.
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:60
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:205
std::optional< bool > AllowRecursiveCall
Indicate whether we allow inlining for recursive call.
Definition: InlineCost.h:238
std::optional< bool > ComputeFullInlineCost
Compute inline cost even when the cost has exceeded the threshold.
Definition: InlineCost.h:232
Represents the relative location of an instruction.
Definition: SampleProf.h:289