LLVM 17.0.0git
HotColdSplitting.cpp
Go to the documentation of this file.
1//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// The goal of hot/cold splitting is to improve the memory locality of code.
11/// The splitting pass does this by identifying cold blocks and moving them into
12/// separate functions.
13///
14/// When the splitting pass finds a cold block (referred to as "the sink"), it
15/// grows a maximal cold region around that block. The maximal region contains
16/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17/// cold as the sink. Once a region is found, it's split out of the original
18/// function provided it's profitable to do so.
19///
20/// [*] In practice, there is some added complexity because some blocks are not
21/// safe to extract.
22///
23/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24/// TODO: Reorder outlined functions.
25///
26//===----------------------------------------------------------------------===//
27
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/BasicBlock.h"
39#include "llvm/IR/CFG.h"
41#include "llvm/IR/Dominators.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Module.h"
46#include "llvm/IR/PassManager.h"
47#include "llvm/IR/User.h"
48#include "llvm/IR/Value.h"
50#include "llvm/Pass.h"
52#include "llvm/Support/Debug.h"
54#include "llvm/Transforms/IPO.h"
56#include <algorithm>
57#include <cassert>
58#include <limits>
59#include <string>
60
61#define DEBUG_TYPE "hotcoldsplit"
62
63STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
64STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
65
66using namespace llvm;
67
68static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
69 cl::init(true), cl::Hidden);
70
71static cl::opt<int>
72 SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
73 cl::desc("Base penalty for splitting cold code (as a "
74 "multiple of TCC_Basic)"));
75
77 "enable-cold-section", cl::init(false), cl::Hidden,
78 cl::desc("Enable placement of extracted cold functions"
79 " into a separate section after hot-cold splitting."));
80
82 ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
84 cl::desc("Name for the section containing cold functions "
85 "extracted by hot-cold splitting."));
86
88 "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
89 cl::desc("Maximum number of parameters for a split function"));
90
91namespace {
92// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
93// this function unless you modify the MBB version as well.
94//
95/// A no successor, non-return block probably ends in unreachable and is cold.
96/// Also consider a block that ends in an indirect branch to be a return block,
97/// since many targets use plain indirect branches to return.
98bool blockEndsInUnreachable(const BasicBlock &BB) {
99 if (!succ_empty(&BB))
100 return false;
101 if (BB.empty())
102 return true;
103 const Instruction *I = BB.getTerminator();
104 return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
105}
106
107bool unlikelyExecuted(BasicBlock &BB) {
108 // Exception handling blocks are unlikely executed.
109 if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
110 return true;
111
112 // The block is cold if it calls/invokes a cold function. However, do not
113 // mark sanitizer traps as cold.
114 for (Instruction &I : BB)
115 if (auto *CB = dyn_cast<CallBase>(&I))
116 if (CB->hasFnAttr(Attribute::Cold) &&
117 !CB->getMetadata(LLVMContext::MD_nosanitize))
118 return true;
119
120 // The block is cold if it has an unreachable terminator, unless it's
121 // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
122 if (blockEndsInUnreachable(BB)) {
123 if (auto *CI =
124 dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
125 if (CI->hasFnAttr(Attribute::NoReturn))
126 return false;
127 return true;
128 }
129
130 return false;
131}
132
133/// Check whether it's safe to outline \p BB.
134static bool mayExtractBlock(const BasicBlock &BB) {
135 // EH pads are unsafe to outline because doing so breaks EH type tables. It
136 // follows that invoke instructions cannot be extracted, because CodeExtractor
137 // requires unwind destinations to be within the extraction region.
138 //
139 // Resumes that are not reachable from a cleanup landing pad are considered to
140 // be unreachable. It’s not safe to split them out either.
141 if (BB.hasAddressTaken() || BB.isEHPad())
142 return false;
143 auto Term = BB.getTerminator();
144 return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
145}
146
147/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
148/// If \p UpdateEntryCount is true (set when this is a new split function and
149/// module has profile data), set entry count to 0 to ensure treated as cold.
150/// Return true if the function is changed.
151static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
152 assert(!F.hasOptNone() && "Can't mark this cold");
153 bool Changed = false;
154 if (!F.hasFnAttribute(Attribute::Cold)) {
155 F.addFnAttr(Attribute::Cold);
156 Changed = true;
157 }
158 if (!F.hasFnAttribute(Attribute::MinSize)) {
159 F.addFnAttr(Attribute::MinSize);
160 Changed = true;
161 }
162 if (UpdateEntryCount) {
163 // Set the entry count to 0 to ensure it is placed in the unlikely text
164 // section when function sections are enabled.
165 F.setEntryCount(0);
166 Changed = true;
167 }
168
169 return Changed;
170}
171
172class HotColdSplittingLegacyPass : public ModulePass {
173public:
174 static char ID;
175 HotColdSplittingLegacyPass() : ModulePass(ID) {
177 }
178
179 void getAnalysisUsage(AnalysisUsage &AU) const override {
184 }
185
186 bool runOnModule(Module &M) override;
187};
188
189} // end anonymous namespace
190
191/// Check whether \p F is inherently cold.
192bool HotColdSplitting::isFunctionCold(const Function &F) const {
193 if (F.hasFnAttribute(Attribute::Cold))
194 return true;
195
196 if (F.getCallingConv() == CallingConv::Cold)
197 return true;
198
199 if (PSI->isFunctionEntryCold(&F))
200 return true;
201
202 return false;
203}
204
205// Returns false if the function should not be considered for hot-cold split
206// optimization.
207bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
208 if (F.hasFnAttribute(Attribute::AlwaysInline))
209 return false;
210
211 if (F.hasFnAttribute(Attribute::NoInline))
212 return false;
213
214 // A function marked `noreturn` may contain unreachable terminators: these
215 // should not be considered cold, as the function may be a trampoline.
216 if (F.hasFnAttribute(Attribute::NoReturn))
217 return false;
218
219 if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
220 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
221 F.hasFnAttribute(Attribute::SanitizeThread) ||
222 F.hasFnAttribute(Attribute::SanitizeMemory))
223 return false;
224
225 return true;
226}
227
228/// Get the benefit score of outlining \p Region.
231 // Sum up the code size costs of non-terminator instructions. Tight coupling
232 // with \ref getOutliningPenalty is needed to model the costs of terminators.
233 InstructionCost Benefit = 0;
234 for (BasicBlock *BB : Region)
236 if (&I != BB->getTerminator())
237 Benefit +=
239
240 return Benefit;
241}
242
243/// Get the penalty score for outlining \p Region.
245 unsigned NumInputs, unsigned NumOutputs) {
246 int Penalty = SplittingThreshold;
247 LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
248
249 // If the splitting threshold is set at or below zero, skip the usual
250 // profitability check.
251 if (SplittingThreshold <= 0)
252 return Penalty;
253
254 // Find the number of distinct exit blocks for the region. Use a conservative
255 // check to determine whether control returns from the region.
256 bool NoBlocksReturn = true;
257 SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
258 for (BasicBlock *BB : Region) {
259 // If a block has no successors, only assume it does not return if it's
260 // unreachable.
261 if (succ_empty(BB)) {
262 NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
263 continue;
264 }
265
266 for (BasicBlock *SuccBB : successors(BB)) {
267 if (!is_contained(Region, SuccBB)) {
268 NoBlocksReturn = false;
269 SuccsOutsideRegion.insert(SuccBB);
270 }
271 }
272 }
273
274 // Count the number of phis in exit blocks with >= 2 incoming values from the
275 // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
276 // and new outputs are created to supply the split phis. CodeExtractor can't
277 // report these new outputs until extraction begins, but it's important to
278 // factor the cost of the outputs into the cost calculation.
279 unsigned NumSplitExitPhis = 0;
280 for (BasicBlock *ExitBB : SuccsOutsideRegion) {
281 for (PHINode &PN : ExitBB->phis()) {
282 // Find all incoming values from the outlining region.
283 int NumIncomingVals = 0;
284 for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
285 if (llvm::is_contained(Region, PN.getIncomingBlock(i))) {
286 ++NumIncomingVals;
287 if (NumIncomingVals > 1) {
288 ++NumSplitExitPhis;
289 break;
290 }
291 }
292 }
293 }
294
295 // Apply a penalty for calling the split function. Factor in the cost of
296 // materializing all of the parameters.
297 int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
298 int NumParams = NumInputs + NumOutputsAndSplitPhis;
299 if (NumParams > MaxParametersForSplit) {
300 LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
301 << " outputs exceeds parameter limit ("
302 << MaxParametersForSplit << ")\n");
303 return std::numeric_limits<int>::max();
304 }
305 const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
306 LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
307 Penalty += CostForArgMaterialization * NumParams;
308
309 // Apply the typical code size cost for an output alloca and its associated
310 // reload in the caller. Also penalize the associated store in the callee.
311 LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
312 << " outputs/split phis\n");
313 const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
314 Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
315
316 // Apply a `noreturn` bonus.
317 if (NoBlocksReturn) {
318 LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
319 << " non-returning terminators\n");
320 Penalty -= Region.size();
321 }
322
323 // Apply a penalty for having more than one successor outside of the region.
324 // This penalty accounts for the switch needed in the caller.
325 if (SuccsOutsideRegion.size() > 1) {
326 LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
327 << " non-region successors\n");
328 Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
329 }
330
331 return Penalty;
332}
333
334Function *HotColdSplitting::extractColdRegion(
337 OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
338 assert(!Region.empty());
339
340 // TODO: Pass BFI and BPI to update profile information.
341 CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
342 /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
343 /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
344 /* Suffix */ "cold." + std::to_string(Count));
345
346 // Perform a simple cost/benefit analysis to decide whether or not to permit
347 // splitting.
348 SetVector<Value *> Inputs, Outputs, Sinks;
349 CE.findInputsOutputs(Inputs, Outputs, Sinks);
350 InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
351 int OutliningPenalty =
352 getOutliningPenalty(Region, Inputs.size(), Outputs.size());
353 LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
354 << ", penalty = " << OutliningPenalty << "\n");
355 if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
356 return nullptr;
357
358 Function *OrigF = Region[0]->getParent();
359 if (Function *OutF = CE.extractCodeRegion(CEAC)) {
360 User *U = *OutF->user_begin();
361 CallInst *CI = cast<CallInst>(U);
362 NumColdRegionsOutlined++;
363 if (TTI.useColdCCForColdCall(*OutF)) {
364 OutF->setCallingConv(CallingConv::Cold);
366 }
367 CI->setIsNoInline();
368
370 OutF->setSection(ColdSectionName);
371 else {
372 if (OrigF->hasSection())
373 OutF->setSection(OrigF->getSection());
374 }
375
376 markFunctionCold(*OutF, BFI != nullptr);
377
378 LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
379 ORE.emit([&]() {
380 return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
381 &*Region[0]->begin())
382 << ore::NV("Original", OrigF) << " split cold code into "
383 << ore::NV("Split", OutF);
384 });
385 return OutF;
386 }
387
388 ORE.emit([&]() {
389 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
390 &*Region[0]->begin())
391 << "Failed to extract region at block "
392 << ore::NV("Block", Region.front());
393 });
394 return nullptr;
395}
396
397/// A pair of (basic block, score).
398using BlockTy = std::pair<BasicBlock *, unsigned>;
399
400namespace {
401/// A maximal outlining region. This contains all blocks post-dominated by a
402/// sink block, the sink block itself, and all blocks dominated by the sink.
403/// If sink-predecessors and sink-successors cannot be extracted in one region,
404/// the static constructor returns a list of suitable extraction regions.
405class OutliningRegion {
406 /// A list of (block, score) pairs. A block's score is non-zero iff it's a
407 /// viable sub-region entry point. Blocks with higher scores are better entry
408 /// points (i.e. they are more distant ancestors of the sink block).
409 SmallVector<BlockTy, 0> Blocks = {};
410
411 /// The suggested entry point into the region. If the region has multiple
412 /// entry points, all blocks within the region may not be reachable from this
413 /// entry point.
414 BasicBlock *SuggestedEntryPoint = nullptr;
415
416 /// Whether the entire function is cold.
417 bool EntireFunctionCold = false;
418
419 /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
420 static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
421 return mayExtractBlock(BB) ? Score : 0;
422 }
423
424 /// These scores should be lower than the score for predecessor blocks,
425 /// because regions starting at predecessor blocks are typically larger.
426 static constexpr unsigned ScoreForSuccBlock = 1;
427 static constexpr unsigned ScoreForSinkBlock = 1;
428
429 OutliningRegion(const OutliningRegion &) = delete;
430 OutliningRegion &operator=(const OutliningRegion &) = delete;
431
432public:
433 OutliningRegion() = default;
434 OutliningRegion(OutliningRegion &&) = default;
435 OutliningRegion &operator=(OutliningRegion &&) = default;
436
437 static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
438 const DominatorTree &DT,
439 const PostDominatorTree &PDT) {
440 std::vector<OutliningRegion> Regions;
441 SmallPtrSet<BasicBlock *, 4> RegionBlocks;
442
443 Regions.emplace_back();
444 OutliningRegion *ColdRegion = &Regions.back();
445
446 auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
447 RegionBlocks.insert(BB);
448 ColdRegion->Blocks.emplace_back(BB, Score);
449 };
450
451 // The ancestor farthest-away from SinkBB, and also post-dominated by it.
452 unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
453 ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
454 unsigned BestScore = SinkScore;
455
456 // Visit SinkBB's ancestors using inverse DFS.
457 auto PredIt = ++idf_begin(&SinkBB);
458 auto PredEnd = idf_end(&SinkBB);
459 while (PredIt != PredEnd) {
460 BasicBlock &PredBB = **PredIt;
461 bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
462
463 // If the predecessor is cold and has no predecessors, the entire
464 // function must be cold.
465 if (SinkPostDom && pred_empty(&PredBB)) {
466 ColdRegion->EntireFunctionCold = true;
467 return Regions;
468 }
469
470 // If SinkBB does not post-dominate a predecessor, do not mark the
471 // predecessor (or any of its predecessors) cold.
472 if (!SinkPostDom || !mayExtractBlock(PredBB)) {
473 PredIt.skipChildren();
474 continue;
475 }
476
477 // Keep track of the post-dominated ancestor farthest away from the sink.
478 // The path length is always >= 2, ensuring that predecessor blocks are
479 // considered as entry points before the sink block.
480 unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
481 if (PredScore > BestScore) {
482 ColdRegion->SuggestedEntryPoint = &PredBB;
483 BestScore = PredScore;
484 }
485
486 addBlockToRegion(&PredBB, PredScore);
487 ++PredIt;
488 }
489
490 // If the sink can be added to the cold region, do so. It's considered as
491 // an entry point before any sink-successor blocks.
492 //
493 // Otherwise, split cold sink-successor blocks using a separate region.
494 // This satisfies the requirement that all extraction blocks other than the
495 // first have predecessors within the extraction region.
496 if (mayExtractBlock(SinkBB)) {
497 addBlockToRegion(&SinkBB, SinkScore);
498 if (pred_empty(&SinkBB)) {
499 ColdRegion->EntireFunctionCold = true;
500 return Regions;
501 }
502 } else {
503 Regions.emplace_back();
504 ColdRegion = &Regions.back();
505 BestScore = 0;
506 }
507
508 // Find all successors of SinkBB dominated by SinkBB using DFS.
509 auto SuccIt = ++df_begin(&SinkBB);
510 auto SuccEnd = df_end(&SinkBB);
511 while (SuccIt != SuccEnd) {
512 BasicBlock &SuccBB = **SuccIt;
513 bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
514
515 // Don't allow the backwards & forwards DFSes to mark the same block.
516 bool DuplicateBlock = RegionBlocks.count(&SuccBB);
517
518 // If SinkBB does not dominate a successor, do not mark the successor (or
519 // any of its successors) cold.
520 if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
521 SuccIt.skipChildren();
522 continue;
523 }
524
525 unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
526 if (SuccScore > BestScore) {
527 ColdRegion->SuggestedEntryPoint = &SuccBB;
528 BestScore = SuccScore;
529 }
530
531 addBlockToRegion(&SuccBB, SuccScore);
532 ++SuccIt;
533 }
534
535 return Regions;
536 }
537
538 /// Whether this region has nothing to extract.
539 bool empty() const { return !SuggestedEntryPoint; }
540
541 /// The blocks in this region.
542 ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
543
544 /// Whether the entire function containing this region is cold.
545 bool isEntireFunctionCold() const { return EntireFunctionCold; }
546
547 /// Remove a sub-region from this region and return it as a block sequence.
548 BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
549 assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
550
551 // Remove blocks dominated by the suggested entry point from this region.
552 // During the removal, identify the next best entry point into the region.
553 // Ensure that the first extracted block is the suggested entry point.
554 BlockSequence SubRegion = {SuggestedEntryPoint};
555 BasicBlock *NextEntryPoint = nullptr;
556 unsigned NextScore = 0;
557 auto RegionEndIt = Blocks.end();
558 auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
559 BasicBlock *BB = Block.first;
560 unsigned Score = Block.second;
561 bool InSubRegion =
562 BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
563 if (!InSubRegion && Score > NextScore) {
564 NextEntryPoint = BB;
565 NextScore = Score;
566 }
567 if (InSubRegion && BB != SuggestedEntryPoint)
568 SubRegion.push_back(BB);
569 return InSubRegion;
570 });
571 Blocks.erase(RegionStartIt, RegionEndIt);
572
573 // Update the suggested entry point.
574 SuggestedEntryPoint = NextEntryPoint;
575
576 return SubRegion;
577 }
578};
579} // namespace
580
581bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
582 bool Changed = false;
583
584 // The set of cold blocks.
586
587 // The worklist of non-intersecting regions left to outline.
588 SmallVector<OutliningRegion, 2> OutliningWorklist;
589
590 // Set up an RPO traversal. Experimentally, this performs better (outlines
591 // more) than a PO traversal, because we prevent region overlap by keeping
592 // the first region to contain a block.
594
595 // Calculate domtrees lazily. This reduces compile-time significantly.
596 std::unique_ptr<DominatorTree> DT;
597 std::unique_ptr<PostDominatorTree> PDT;
598
599 // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
600 // reduces compile-time significantly. TODO: When we *do* use BFI, we should
601 // be able to salvage its domtrees instead of recomputing them.
602 BlockFrequencyInfo *BFI = nullptr;
603 if (HasProfileSummary)
604 BFI = GetBFI(F);
605
606 TargetTransformInfo &TTI = GetTTI(F);
607 OptimizationRemarkEmitter &ORE = (*GetORE)(F);
608 AssumptionCache *AC = LookupAC(F);
609
610 // Find all cold regions.
611 for (BasicBlock *BB : RPOT) {
612 // This block is already part of some outlining region.
613 if (ColdBlocks.count(BB))
614 continue;
615
616 bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
617 (EnableStaticAnalysis && unlikelyExecuted(*BB));
618 if (!Cold)
619 continue;
620
621 LLVM_DEBUG({
622 dbgs() << "Found a cold block:\n";
623 BB->dump();
624 });
625
626 if (!DT)
627 DT = std::make_unique<DominatorTree>(F);
628 if (!PDT)
629 PDT = std::make_unique<PostDominatorTree>(F);
630
631 auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
632 for (OutliningRegion &Region : Regions) {
633 if (Region.empty())
634 continue;
635
636 if (Region.isEntireFunctionCold()) {
637 LLVM_DEBUG(dbgs() << "Entire function is cold\n");
638 return markFunctionCold(F);
639 }
640
641 // If this outlining region intersects with another, drop the new region.
642 //
643 // TODO: It's theoretically possible to outline more by only keeping the
644 // largest region which contains a block, but the extra bookkeeping to do
645 // this is tricky/expensive.
646 bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
647 return !ColdBlocks.insert(Block.first).second;
648 });
649 if (RegionsOverlap)
650 continue;
651
652 OutliningWorklist.emplace_back(std::move(Region));
653 ++NumColdRegionsFound;
654 }
655 }
656
657 if (OutliningWorklist.empty())
658 return Changed;
659
660 // Outline single-entry cold regions, splitting up larger regions as needed.
661 unsigned OutlinedFunctionID = 1;
662 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
664 do {
665 OutliningRegion Region = OutliningWorklist.pop_back_val();
666 assert(!Region.empty() && "Empty outlining region in worklist");
667 do {
668 BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
669 LLVM_DEBUG({
670 dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
671 for (BasicBlock *BB : SubRegion)
672 BB->dump();
673 });
674
675 Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
676 ORE, AC, OutlinedFunctionID);
677 if (Outlined) {
678 ++OutlinedFunctionID;
679 Changed = true;
680 }
681 } while (!Region.empty());
682 } while (!OutliningWorklist.empty());
683
684 return Changed;
685}
686
688 bool Changed = false;
689 bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
690 for (Function &F : M) {
691 // Do not touch declarations.
692 if (F.isDeclaration())
693 continue;
694
695 // Do not modify `optnone` functions.
696 if (F.hasOptNone())
697 continue;
698
699 // Detect inherently cold functions and mark them as such.
700 if (isFunctionCold(F)) {
701 Changed |= markFunctionCold(F);
702 continue;
703 }
704
705 if (!shouldOutlineFrom(F)) {
706 LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
707 continue;
708 }
709
710 LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
711 Changed |= outlineColdRegions(F, HasProfileSummary);
712 }
713 return Changed;
714}
715
716bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
717 if (skipModule(M))
718 return false;
719 ProfileSummaryInfo *PSI =
720 &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
721 auto GTTI = [this](Function &F) -> TargetTransformInfo & {
722 return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
723 };
724 auto GBFI = [this](Function &F) {
725 return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
726 };
727 std::unique_ptr<OptimizationRemarkEmitter> ORE;
728 std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
729 [&ORE](Function &F) -> OptimizationRemarkEmitter & {
730 ORE.reset(new OptimizationRemarkEmitter(&F));
731 return *ORE;
732 };
733 auto LookupAC = [this](Function &F) -> AssumptionCache * {
734 if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
735 return ACT->lookupAssumptionCache(F);
736 return nullptr;
737 };
738
739 return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
740}
741
744 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
745
746 auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
748 };
749
750 auto GBFI = [&FAM](Function &F) {
752 };
753
754 std::function<TargetTransformInfo &(Function &)> GTTI =
757 };
758
759 std::unique_ptr<OptimizationRemarkEmitter> ORE;
760 std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
761 [&ORE](Function &F) -> OptimizationRemarkEmitter & {
762 ORE.reset(new OptimizationRemarkEmitter(&F));
763 return *ORE;
764 };
765
767
768 if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
770 return PreservedAnalyses::all();
771}
772
773char HotColdSplittingLegacyPass::ID = 0;
774INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
775 "Hot Cold Splitting", false, false)
778INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
780
782 return new HotColdSplittingLegacyPass();
783}
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
#define DEBUG_TYPE
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
hotcoldsplit
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
Hot Cold Splitting
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Module.h This file contains the declarations for the Module class.
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:793
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
bool empty() const
Definition: BasicBlock.h:325
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches,...
Definition: BasicBlock.h:495
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:512
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
Analysis pass which computes BlockFrequencyInfo.
Legacy analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1469
void setIsNoInline()
Definition: InstrTypes.h:1868
This class represents a function call, abstracting a target machine's calling convention.
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
StringRef getSection() const
Get the custom section of this global if it has one.
Definition: GlobalObject.h:117
bool hasSection() const
Check if this global has a custom object file section.
Definition: GlobalObject.h:109
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:933
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overriden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
block_range blocks()
Returns a range view of the basic blocks in the region.
Definition: RegionInfo.h:622
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:364
A vector that has set insertion semantics.
Definition: SetVector.h:40
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
size_type size() const
Definition: SmallPtrSet.h:93
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
bool empty() const
Definition: SmallVector.h:94
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
iterator erase(const_iterator CI)
Definition: SmallVector.h:741
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ TCK_CodeSize
Instruction code size.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
user_iterator user_begin()
Definition: Value.h:397
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4938
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
auto successors(const MachineBasicBlock *BB)
df_iterator< T > df_begin(const T &G)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
void initializeHotColdSplittingLegacyPassPass(PassRegistry &)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
ModulePass * createHotColdSplittingPass()
createHotColdSplittingPass - This pass outlines cold blocks into a separate function(s).
idf_iterator< T > idf_end(const T &G)
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1774
idf_iterator< T > idf_begin(const T &G)
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1869
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:118
df_iterator< T > df_end(const T &G)