LLVM 20.0.0git
HotColdSplitting.cpp
Go to the documentation of this file.
1//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// The goal of hot/cold splitting is to improve the memory locality of code.
11/// The splitting pass does this by identifying cold blocks and moving them into
12/// separate functions.
13///
14/// When the splitting pass finds a cold block (referred to as "the sink"), it
15/// grows a maximal cold region around that block. The maximal region contains
16/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17/// cold as the sink. Once a region is found, it's split out of the original
18/// function provided it's profitable to do so.
19///
20/// [*] In practice, there is some added complexity because some blocks are not
21/// safe to extract.
22///
23/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24/// TODO: Reorder outlined functions.
25///
26//===----------------------------------------------------------------------===//
27
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/BasicBlock.h"
39#include "llvm/IR/CFG.h"
41#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/Instruction.h"
46#include "llvm/IR/Module.h"
47#include "llvm/IR/PassManager.h"
49#include "llvm/IR/User.h"
50#include "llvm/IR/Value.h"
52#include "llvm/Support/Debug.h"
54#include "llvm/Transforms/IPO.h"
56#include <cassert>
57#include <limits>
58#include <string>
59
60#define DEBUG_TYPE "hotcoldsplit"
61
62STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
63STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
64
65using namespace llvm;
66
67static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
68 cl::init(true), cl::Hidden);
69
70static cl::opt<int>
71 SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
72 cl::desc("Base penalty for splitting cold code (as a "
73 "multiple of TCC_Basic)"));
74
76 "enable-cold-section", cl::init(false), cl::Hidden,
77 cl::desc("Enable placement of extracted cold functions"
78 " into a separate section after hot-cold splitting."));
79
81 ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
83 cl::desc("Name for the section containing cold functions "
84 "extracted by hot-cold splitting."));
85
87 "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
88 cl::desc("Maximum number of parameters for a split function"));
89
91 "hotcoldsplit-cold-probability-denom", cl::init(100), cl::Hidden,
92 cl::desc("Divisor of cold branch probability."
93 "BranchProbability = 1/ColdBranchProbDenom"));
94
95namespace {
96// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
97// this function unless you modify the MBB version as well.
98//
99/// A no successor, non-return block probably ends in unreachable and is cold.
100/// Also consider a block that ends in an indirect branch to be a return block,
101/// since many targets use plain indirect branches to return.
102bool blockEndsInUnreachable(const BasicBlock &BB) {
103 if (!succ_empty(&BB))
104 return false;
105 if (BB.empty())
106 return true;
107 const Instruction *I = BB.getTerminator();
108 return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
109}
110
111void analyzeProfMetadata(BasicBlock *BB,
112 BranchProbability ColdProbThresh,
113 SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks) {
114 // TODO: Handle branches with > 2 successors.
115 BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
116 if (!CondBr)
117 return;
118
119 uint64_t TrueWt, FalseWt;
120 if (!extractBranchWeights(*CondBr, TrueWt, FalseWt))
121 return;
122
123 auto SumWt = TrueWt + FalseWt;
124 if (SumWt == 0)
125 return;
126
127 auto TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt);
128 auto FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt);
129
130 if (TrueProb <= ColdProbThresh)
131 AnnotatedColdBlocks.insert(CondBr->getSuccessor(0));
132
133 if (FalseProb <= ColdProbThresh)
134 AnnotatedColdBlocks.insert(CondBr->getSuccessor(1));
135}
136
137bool unlikelyExecuted(BasicBlock &BB) {
138 // Exception handling blocks are unlikely executed.
139 if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
140 return true;
141
142 // The block is cold if it calls/invokes a cold function. However, do not
143 // mark sanitizer traps as cold.
144 for (Instruction &I : BB)
145 if (auto *CB = dyn_cast<CallBase>(&I))
146 if (CB->hasFnAttr(Attribute::Cold) &&
147 !CB->getMetadata(LLVMContext::MD_nosanitize))
148 return true;
149
150 // The block is cold if it has an unreachable terminator, unless it's
151 // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
152 if (blockEndsInUnreachable(BB)) {
153 if (auto *CI =
154 dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
155 if (CI->hasFnAttr(Attribute::NoReturn))
156 return false;
157 return true;
158 }
159
160 return false;
161}
162
163/// Check whether it's safe to outline \p BB.
164static bool mayExtractBlock(const BasicBlock &BB) {
165 // EH pads are unsafe to outline because doing so breaks EH type tables. It
166 // follows that invoke instructions cannot be extracted, because CodeExtractor
167 // requires unwind destinations to be within the extraction region.
168 //
169 // Resumes that are not reachable from a cleanup landing pad are considered to
170 // be unreachable. It’s not safe to split them out either.
171
172 if (BB.hasAddressTaken() || BB.isEHPad())
173 return false;
174 auto Term = BB.getTerminator();
175 if (isa<InvokeInst>(Term) || isa<ResumeInst>(Term))
176 return false;
177
178 // Do not outline basic blocks that have token type instructions. e.g.,
179 // exception:
180 // %0 = cleanuppad within none []
181 // call void @"?terminate@@YAXXZ"() [ "funclet"(token %0) ]
182 // br label %continue-exception
183 if (llvm::any_of(
184 BB, [](const Instruction &I) { return I.getType()->isTokenTy(); })) {
185 return false;
186 }
187
188 return true;
189}
190
191/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
192/// If \p UpdateEntryCount is true (set when this is a new split function and
193/// module has profile data), set entry count to 0 to ensure treated as cold.
194/// Return true if the function is changed.
195static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
196 assert(!F.hasOptNone() && "Can't mark this cold");
197 bool Changed = false;
198 if (!F.hasFnAttribute(Attribute::Cold)) {
199 F.addFnAttr(Attribute::Cold);
200 Changed = true;
201 }
202 if (!F.hasFnAttribute(Attribute::MinSize)) {
203 F.addFnAttr(Attribute::MinSize);
204 Changed = true;
205 }
206 if (UpdateEntryCount) {
207 // Set the entry count to 0 to ensure it is placed in the unlikely text
208 // section when function sections are enabled.
209 F.setEntryCount(0);
210 Changed = true;
211 }
212
213 return Changed;
214}
215
216} // end anonymous namespace
217
218/// Check whether \p F is inherently cold.
219bool HotColdSplitting::isFunctionCold(const Function &F) const {
220 if (F.hasFnAttribute(Attribute::Cold))
221 return true;
222
223 if (F.getCallingConv() == CallingConv::Cold)
224 return true;
225
226 if (PSI->isFunctionEntryCold(&F))
227 return true;
228
229 return false;
230}
231
232bool HotColdSplitting::isBasicBlockCold(
233 BasicBlock *BB, BranchProbability ColdProbThresh,
234 SmallPtrSetImpl<BasicBlock *> &AnnotatedColdBlocks,
235 BlockFrequencyInfo *BFI) const {
236 if (BFI) {
237 if (PSI->isColdBlock(BB, BFI))
238 return true;
239 } else {
240 // Find cold blocks of successors of BB during a reverse postorder traversal.
241 analyzeProfMetadata(BB, ColdProbThresh, AnnotatedColdBlocks);
242
243 // A statically cold BB would be known before it is visited
244 // because the prof-data of incoming edges are 'analyzed' as part of RPOT.
245 if (AnnotatedColdBlocks.count(BB))
246 return true;
247 }
248
249 if (EnableStaticAnalysis && unlikelyExecuted(*BB))
250 return true;
251
252 return false;
253}
254
255// Returns false if the function should not be considered for hot-cold split
256// optimization.
257bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
258 if (F.hasFnAttribute(Attribute::AlwaysInline))
259 return false;
260
261 if (F.hasFnAttribute(Attribute::NoInline))
262 return false;
263
264 // A function marked `noreturn` may contain unreachable terminators: these
265 // should not be considered cold, as the function may be a trampoline.
266 if (F.hasFnAttribute(Attribute::NoReturn))
267 return false;
268
269 if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
270 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
271 F.hasFnAttribute(Attribute::SanitizeThread) ||
272 F.hasFnAttribute(Attribute::SanitizeMemory))
273 return false;
274
275 // Do not outline scoped EH personality functions.
276 if (F.hasPersonalityFn())
277 if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
278 return false;
279
280 return true;
281}
282
283/// Get the benefit score of outlining \p Region.
286 // Sum up the code size costs of non-terminator instructions. Tight coupling
287 // with \ref getOutliningPenalty is needed to model the costs of terminators.
288 InstructionCost Benefit = 0;
289 for (BasicBlock *BB : Region)
291 if (&I != BB->getTerminator())
292 Benefit +=
294
295 return Benefit;
296}
297
298/// Get the penalty score for outlining \p Region.
300 unsigned NumInputs, unsigned NumOutputs) {
301 int Penalty = SplittingThreshold;
302 LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
303
304 // If the splitting threshold is set at or below zero, skip the usual
305 // profitability check.
306 if (SplittingThreshold <= 0)
307 return Penalty;
308
309 // Find the number of distinct exit blocks for the region. Use a conservative
310 // check to determine whether control returns from the region.
311 bool NoBlocksReturn = true;
312 SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
313 for (BasicBlock *BB : Region) {
314 // If a block has no successors, only assume it does not return if it's
315 // unreachable.
316 if (succ_empty(BB)) {
317 NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
318 continue;
319 }
320
321 for (BasicBlock *SuccBB : successors(BB)) {
322 if (!is_contained(Region, SuccBB)) {
323 NoBlocksReturn = false;
324 SuccsOutsideRegion.insert(SuccBB);
325 }
326 }
327 }
328
329 // Count the number of phis in exit blocks with >= 2 incoming values from the
330 // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
331 // and new outputs are created to supply the split phis. CodeExtractor can't
332 // report these new outputs until extraction begins, but it's important to
333 // factor the cost of the outputs into the cost calculation.
334 unsigned NumSplitExitPhis = 0;
335 for (BasicBlock *ExitBB : SuccsOutsideRegion) {
336 for (PHINode &PN : ExitBB->phis()) {
337 // Find all incoming values from the outlining region.
338 int NumIncomingVals = 0;
339 for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
340 if (llvm::is_contained(Region, PN.getIncomingBlock(i))) {
341 ++NumIncomingVals;
342 if (NumIncomingVals > 1) {
343 ++NumSplitExitPhis;
344 break;
345 }
346 }
347 }
348 }
349
350 // Apply a penalty for calling the split function. Factor in the cost of
351 // materializing all of the parameters.
352 int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
353 int NumParams = NumInputs + NumOutputsAndSplitPhis;
354 if (NumParams > MaxParametersForSplit) {
355 LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
356 << " outputs exceeds parameter limit ("
357 << MaxParametersForSplit << ")\n");
358 return std::numeric_limits<int>::max();
359 }
360 const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
361 LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
362 Penalty += CostForArgMaterialization * NumParams;
363
364 // Apply the typical code size cost for an output alloca and its associated
365 // reload in the caller. Also penalize the associated store in the callee.
366 LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
367 << " outputs/split phis\n");
368 const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
369 Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
370
371 // Apply a `noreturn` bonus.
372 if (NoBlocksReturn) {
373 LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
374 << " non-returning terminators\n");
375 Penalty -= Region.size();
376 }
377
378 // Apply a penalty for having more than one successor outside of the region.
379 // This penalty accounts for the switch needed in the caller.
380 if (SuccsOutsideRegion.size() > 1) {
381 LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
382 << " non-region successors\n");
383 Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
384 }
385
386 return Penalty;
387}
388
389// Determine if it is beneficial to split the \p Region.
390bool HotColdSplitting::isSplittingBeneficial(CodeExtractor &CE,
391 const BlockSequence &Region,
393 assert(!Region.empty());
394
395 // Perform a simple cost/benefit analysis to decide whether or not to permit
396 // splitting.
397 SetVector<Value *> Inputs, Outputs, Sinks;
398 CE.findInputsOutputs(Inputs, Outputs, Sinks);
399 InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
400 int OutliningPenalty =
401 getOutliningPenalty(Region, Inputs.size(), Outputs.size());
402 LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
403 << ", penalty = " << OutliningPenalty << "\n");
404 if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
405 return false;
406
407 return true;
408}
409
410// Split the single \p EntryPoint cold region. \p CE is the region code
411// extractor.
412Function *HotColdSplitting::extractColdRegion(
413 BasicBlock &EntryPoint, CodeExtractor &CE,
416 Function *OrigF = EntryPoint.getParent();
417 if (Function *OutF = CE.extractCodeRegion(CEAC)) {
418 User *U = *OutF->user_begin();
419 CallInst *CI = cast<CallInst>(U);
420 NumColdRegionsOutlined++;
421 if (TTI.useColdCCForColdCall(*OutF)) {
422 OutF->setCallingConv(CallingConv::Cold);
424 }
425 CI->setIsNoInline();
426
428 OutF->setSection(ColdSectionName);
429 else {
430 if (OrigF->hasSection())
431 OutF->setSection(OrigF->getSection());
432 }
433
434 markFunctionCold(*OutF, BFI != nullptr);
435
436 LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
437 ORE.emit([&]() {
438 return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
439 &*EntryPoint.begin())
440 << ore::NV("Original", OrigF) << " split cold code into "
441 << ore::NV("Split", OutF);
442 });
443 return OutF;
444 }
445
446 ORE.emit([&]() {
447 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
448 &*EntryPoint.begin())
449 << "Failed to extract region at block "
450 << ore::NV("Block", &EntryPoint);
451 });
452 return nullptr;
453}
454
455/// A pair of (basic block, score).
456using BlockTy = std::pair<BasicBlock *, unsigned>;
457
458namespace {
459/// A maximal outlining region. This contains all blocks post-dominated by a
460/// sink block, the sink block itself, and all blocks dominated by the sink.
461/// If sink-predecessors and sink-successors cannot be extracted in one region,
462/// the static constructor returns a list of suitable extraction regions.
463class OutliningRegion {
464 /// A list of (block, score) pairs. A block's score is non-zero iff it's a
465 /// viable sub-region entry point. Blocks with higher scores are better entry
466 /// points (i.e. they are more distant ancestors of the sink block).
468
469 /// The suggested entry point into the region. If the region has multiple
470 /// entry points, all blocks within the region may not be reachable from this
471 /// entry point.
472 BasicBlock *SuggestedEntryPoint = nullptr;
473
474 /// Whether the entire function is cold.
475 bool EntireFunctionCold = false;
476
477 /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
478 static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
479 return mayExtractBlock(BB) ? Score : 0;
480 }
481
482 /// These scores should be lower than the score for predecessor blocks,
483 /// because regions starting at predecessor blocks are typically larger.
484 static constexpr unsigned ScoreForSuccBlock = 1;
485 static constexpr unsigned ScoreForSinkBlock = 1;
486
487 OutliningRegion(const OutliningRegion &) = delete;
488 OutliningRegion &operator=(const OutliningRegion &) = delete;
489
490public:
491 OutliningRegion() = default;
492 OutliningRegion(OutliningRegion &&) = default;
493 OutliningRegion &operator=(OutliningRegion &&) = default;
494
495 static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
496 const DominatorTree &DT,
497 const PostDominatorTree &PDT) {
498 std::vector<OutliningRegion> Regions;
499 SmallPtrSet<BasicBlock *, 4> RegionBlocks;
500
501 Regions.emplace_back();
502 OutliningRegion *ColdRegion = &Regions.back();
503
504 auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
505 RegionBlocks.insert(BB);
506 ColdRegion->Blocks.emplace_back(BB, Score);
507 };
508
509 // The ancestor farthest-away from SinkBB, and also post-dominated by it.
510 unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
511 ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
512 unsigned BestScore = SinkScore;
513
514 // Visit SinkBB's ancestors using inverse DFS.
515 auto PredIt = ++idf_begin(&SinkBB);
516 auto PredEnd = idf_end(&SinkBB);
517 while (PredIt != PredEnd) {
518 BasicBlock &PredBB = **PredIt;
519 bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
520
521 // If the predecessor is cold and has no predecessors, the entire
522 // function must be cold.
523 if (SinkPostDom && pred_empty(&PredBB)) {
524 ColdRegion->EntireFunctionCold = true;
525 return Regions;
526 }
527
528 // If SinkBB does not post-dominate a predecessor, do not mark the
529 // predecessor (or any of its predecessors) cold.
530 if (!SinkPostDom || !mayExtractBlock(PredBB)) {
531 PredIt.skipChildren();
532 continue;
533 }
534
535 // Keep track of the post-dominated ancestor farthest away from the sink.
536 // The path length is always >= 2, ensuring that predecessor blocks are
537 // considered as entry points before the sink block.
538 unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
539 if (PredScore > BestScore) {
540 ColdRegion->SuggestedEntryPoint = &PredBB;
541 BestScore = PredScore;
542 }
543
544 addBlockToRegion(&PredBB, PredScore);
545 ++PredIt;
546 }
547
548 // If the sink can be added to the cold region, do so. It's considered as
549 // an entry point before any sink-successor blocks.
550 //
551 // Otherwise, split cold sink-successor blocks using a separate region.
552 // This satisfies the requirement that all extraction blocks other than the
553 // first have predecessors within the extraction region.
554 if (mayExtractBlock(SinkBB)) {
555 addBlockToRegion(&SinkBB, SinkScore);
556 if (pred_empty(&SinkBB)) {
557 ColdRegion->EntireFunctionCold = true;
558 return Regions;
559 }
560 } else {
561 Regions.emplace_back();
562 ColdRegion = &Regions.back();
563 BestScore = 0;
564 }
565
566 // Find all successors of SinkBB dominated by SinkBB using DFS.
567 auto SuccIt = ++df_begin(&SinkBB);
568 auto SuccEnd = df_end(&SinkBB);
569 while (SuccIt != SuccEnd) {
570 BasicBlock &SuccBB = **SuccIt;
571 bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
572
573 // Don't allow the backwards & forwards DFSes to mark the same block.
574 bool DuplicateBlock = RegionBlocks.count(&SuccBB);
575
576 // If SinkBB does not dominate a successor, do not mark the successor (or
577 // any of its successors) cold.
578 if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
579 SuccIt.skipChildren();
580 continue;
581 }
582
583 unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
584 if (SuccScore > BestScore) {
585 ColdRegion->SuggestedEntryPoint = &SuccBB;
586 BestScore = SuccScore;
587 }
588
589 addBlockToRegion(&SuccBB, SuccScore);
590 ++SuccIt;
591 }
592
593 return Regions;
594 }
595
596 /// Whether this region has nothing to extract.
597 bool empty() const { return !SuggestedEntryPoint; }
598
599 /// The blocks in this region.
601
602 /// Whether the entire function containing this region is cold.
603 bool isEntireFunctionCold() const { return EntireFunctionCold; }
604
605 /// Remove a sub-region from this region and return it as a block sequence.
606 BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
607 assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
608
609 // Remove blocks dominated by the suggested entry point from this region.
610 // During the removal, identify the next best entry point into the region.
611 // Ensure that the first extracted block is the suggested entry point.
612 BlockSequence SubRegion = {SuggestedEntryPoint};
613 BasicBlock *NextEntryPoint = nullptr;
614 unsigned NextScore = 0;
615 auto RegionEndIt = Blocks.end();
616 auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
617 BasicBlock *BB = Block.first;
618 unsigned Score = Block.second;
619 bool InSubRegion =
620 BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
621 if (!InSubRegion && Score > NextScore) {
622 NextEntryPoint = BB;
623 NextScore = Score;
624 }
625 if (InSubRegion && BB != SuggestedEntryPoint)
626 SubRegion.push_back(BB);
627 return InSubRegion;
628 });
629 Blocks.erase(RegionStartIt, RegionEndIt);
630
631 // Update the suggested entry point.
632 SuggestedEntryPoint = NextEntryPoint;
633
634 return SubRegion;
635 }
636};
637} // namespace
638
639bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
640 // The set of cold blocks outlined.
642
643 // The set of cold blocks cannot be outlined.
644 SmallPtrSet<BasicBlock *, 4> CannotBeOutlinedColdBlocks;
645
646 // Set of cold blocks obtained with RPOT.
647 SmallPtrSet<BasicBlock *, 4> AnnotatedColdBlocks;
648
649 // The worklist of non-intersecting regions left to outline. The first member
650 // of the pair is the entry point into the region to be outlined.
652
653 // Set up an RPO traversal. Experimentally, this performs better (outlines
654 // more) than a PO traversal, because we prevent region overlap by keeping
655 // the first region to contain a block.
657
658 // Calculate domtrees lazily. This reduces compile-time significantly.
659 std::unique_ptr<DominatorTree> DT;
660 std::unique_ptr<PostDominatorTree> PDT;
661
662 // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
663 // reduces compile-time significantly. TODO: When we *do* use BFI, we should
664 // be able to salvage its domtrees instead of recomputing them.
665 BlockFrequencyInfo *BFI = nullptr;
666 if (HasProfileSummary)
667 BFI = GetBFI(F);
668
669 TargetTransformInfo &TTI = GetTTI(F);
670 OptimizationRemarkEmitter &ORE = (*GetORE)(F);
671 AssumptionCache *AC = LookupAC(F);
672 auto ColdProbThresh = TTI.getPredictableBranchThreshold().getCompl();
673
674 if (ColdBranchProbDenom.getNumOccurrences())
675 ColdProbThresh = BranchProbability(1, ColdBranchProbDenom.getValue());
676
677 unsigned OutlinedFunctionID = 1;
678 // Find all cold regions.
679 for (BasicBlock *BB : RPOT) {
680 // This block is already part of some outlining region.
681 if (ColdBlocks.count(BB))
682 continue;
683
684 // This block is already part of some region cannot be outlined.
685 if (CannotBeOutlinedColdBlocks.count(BB))
686 continue;
687
688 if (!isBasicBlockCold(BB, ColdProbThresh, AnnotatedColdBlocks, BFI))
689 continue;
690
691 LLVM_DEBUG({
692 dbgs() << "Found a cold block:\n";
693 BB->dump();
694 });
695
696 if (!DT)
697 DT = std::make_unique<DominatorTree>(F);
698 if (!PDT)
699 PDT = std::make_unique<PostDominatorTree>(F);
700
701 auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
702 for (OutliningRegion &Region : Regions) {
703 if (Region.empty())
704 continue;
705
706 if (Region.isEntireFunctionCold()) {
707 LLVM_DEBUG(dbgs() << "Entire function is cold\n");
708 return markFunctionCold(F);
709 }
710
711 do {
712 BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
713 LLVM_DEBUG({
714 dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
715 for (BasicBlock *BB : SubRegion)
716 BB->dump();
717 });
718
719 // TODO: Pass BFI and BPI to update profile information.
721 SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr,
722 /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
723 /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
724 /* Suffix */ "cold." + std::to_string(OutlinedFunctionID));
725
726 if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) &&
727 // If this outlining region intersects with another, drop the new
728 // region.
729 //
730 // TODO: It's theoretically possible to outline more by only keeping
731 // the largest region which contains a block, but the extra
732 // bookkeeping to do this is tricky/expensive.
733 none_of(SubRegion, [&](BasicBlock *Block) {
734 return ColdBlocks.contains(Block);
735 })) {
736 ColdBlocks.insert(SubRegion.begin(), SubRegion.end());
737
738 LLVM_DEBUG({
739 for (auto *Block : SubRegion)
740 dbgs() << " contains cold block:" << Block->getName() << "\n";
741 });
742
743 OutliningWorklist.emplace_back(
744 std::make_pair(SubRegion[0], std::move(CE)));
745 ++OutlinedFunctionID;
746 } else {
747 // The cold block region cannot be outlined.
748 for (auto *Block : SubRegion)
749 if ((DT->dominates(BB, Block) && PDT->dominates(Block, BB)) ||
750 (PDT->dominates(BB, Block) && DT->dominates(Block, BB)))
751 // Will skip this cold block in the loop to save the compile time
752 CannotBeOutlinedColdBlocks.insert(Block);
753 }
754 } while (!Region.empty());
755
756 ++NumColdRegionsFound;
757 }
758 }
759
760 if (OutliningWorklist.empty())
761 return false;
762
763 // Outline single-entry cold regions, splitting up larger regions as needed.
764 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
766 for (auto &BCE : OutliningWorklist) {
767 Function *Outlined =
768 extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE);
769 assert(Outlined && "Should be outlined");
770 (void)Outlined;
771 }
772
773 return true;
774}
775
777 bool Changed = false;
778 bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
779 for (Function &F : M) {
780 // Do not touch declarations.
781 if (F.isDeclaration())
782 continue;
783
784 // Do not modify `optnone` functions.
785 if (F.hasOptNone())
786 continue;
787
788 // Detect inherently cold functions and mark them as such.
789 if (isFunctionCold(F)) {
790 Changed |= markFunctionCold(F);
791 continue;
792 }
793
794 if (!shouldOutlineFrom(F)) {
795 LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
796 continue;
797 }
798
799 LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
800 Changed |= outlineColdRegions(F, HasProfileSummary);
801 }
802 return Changed;
803}
804
807 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
808
809 auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
811 };
812
813 auto GBFI = [&FAM](Function &F) {
815 };
816
817 std::function<TargetTransformInfo &(Function &)> GTTI =
820 };
821
822 std::unique_ptr<OptimizationRemarkEmitter> ORE;
823 std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
824 [&ORE](Function &F) -> OptimizationRemarkEmitter & {
825 ORE.reset(new OptimizationRemarkEmitter(&F));
826 return *ORE;
827 };
828
830
831 if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
833 return PreservedAnalyses::all();
834}
bbsections Prepares for basic block by splitting functions into clusters of basic blocks
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
#define LLVM_DEBUG(...)
Definition: Debug.h:106
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
#define DEBUG_TYPE
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
static cl::opt< int > ColdBranchProbDenom("hotcoldsplit-cold-probability-denom", cl::init(100), cl::Hidden, cl::desc("Divisor of cold branch probability." "BranchProbability = 1/ColdBranchProbDenom"))
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
This file contains the declarations for profiling metadata utility functions.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:250
bool empty() const
Definition: BasicBlock.h:470
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches,...
Definition: BasicBlock.h:658
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
BasicBlock * getSuccessor(unsigned i) const
static BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
BranchProbability getCompl() const
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1411
void setIsNoInline()
Definition: InstrTypes.h:1883
This class represents a function call, abstracting a target machine's calling convention.
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
StringRef getSection() const
Get the custom section of this global if it has one.
Definition: GlobalObject.h:117
bool hasSection() const
Check if this global has a custom object file section.
Definition: GlobalObject.h:109
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:114
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool isColdBlock(const BBType *BB, BFIT *BFI) const
Returns true if BasicBlock BB is considered cold.
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ TCK_CodeSize
Instruction code size.
BranchProbability getPredictableBranchThreshold() const
If a branch or a select condition is skewed in one direction by more than this factor,...
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5304
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
auto successors(const MachineBasicBlock *BB)
df_iterator< T > df_begin(const T &G)
bool isScopedEHPersonality(EHPersonality Pers)
Returns true if this personality uses scope-style EH IR instructions: catchswitch,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
idf_iterator< T > idf_end(const T &G)
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1778
idf_iterator< T > idf_begin(const T &G)
bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:118
df_iterator< T > df_end(const T &G)