LLVM 17.0.0git
HotColdSplitting.cpp
Go to the documentation of this file.
1//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// The goal of hot/cold splitting is to improve the memory locality of code.
11/// The splitting pass does this by identifying cold blocks and moving them into
12/// separate functions.
13///
14/// When the splitting pass finds a cold block (referred to as "the sink"), it
15/// grows a maximal cold region around that block. The maximal region contains
16/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17/// cold as the sink. Once a region is found, it's split out of the original
18/// function provided it's profitable to do so.
19///
20/// [*] In practice, there is some added complexity because some blocks are not
21/// safe to extract.
22///
23/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24/// TODO: Reorder outlined functions.
25///
26//===----------------------------------------------------------------------===//
27
31#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/BasicBlock.h"
39#include "llvm/IR/CFG.h"
41#include "llvm/IR/Dominators.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Module.h"
46#include "llvm/IR/PassManager.h"
47#include "llvm/IR/User.h"
48#include "llvm/IR/Value.h"
50#include "llvm/Support/Debug.h"
52#include "llvm/Transforms/IPO.h"
54#include <algorithm>
55#include <cassert>
56#include <limits>
57#include <string>
58
59#define DEBUG_TYPE "hotcoldsplit"
60
61STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
62STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
63
64using namespace llvm;
65
66static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
67 cl::init(true), cl::Hidden);
68
69static cl::opt<int>
70 SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
71 cl::desc("Base penalty for splitting cold code (as a "
72 "multiple of TCC_Basic)"));
73
75 "enable-cold-section", cl::init(false), cl::Hidden,
76 cl::desc("Enable placement of extracted cold functions"
77 " into a separate section after hot-cold splitting."));
78
80 ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
82 cl::desc("Name for the section containing cold functions "
83 "extracted by hot-cold splitting."));
84
86 "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
87 cl::desc("Maximum number of parameters for a split function"));
88
89namespace {
90// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
91// this function unless you modify the MBB version as well.
92//
93/// A no successor, non-return block probably ends in unreachable and is cold.
94/// Also consider a block that ends in an indirect branch to be a return block,
95/// since many targets use plain indirect branches to return.
96bool blockEndsInUnreachable(const BasicBlock &BB) {
97 if (!succ_empty(&BB))
98 return false;
99 if (BB.empty())
100 return true;
101 const Instruction *I = BB.getTerminator();
102 return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
103}
104
105bool unlikelyExecuted(BasicBlock &BB) {
106 // Exception handling blocks are unlikely executed.
107 if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
108 return true;
109
110 // The block is cold if it calls/invokes a cold function. However, do not
111 // mark sanitizer traps as cold.
112 for (Instruction &I : BB)
113 if (auto *CB = dyn_cast<CallBase>(&I))
114 if (CB->hasFnAttr(Attribute::Cold) &&
115 !CB->getMetadata(LLVMContext::MD_nosanitize))
116 return true;
117
118 // The block is cold if it has an unreachable terminator, unless it's
119 // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
120 if (blockEndsInUnreachable(BB)) {
121 if (auto *CI =
122 dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
123 if (CI->hasFnAttr(Attribute::NoReturn))
124 return false;
125 return true;
126 }
127
128 return false;
129}
130
131/// Check whether it's safe to outline \p BB.
132static bool mayExtractBlock(const BasicBlock &BB) {
133 // EH pads are unsafe to outline because doing so breaks EH type tables. It
134 // follows that invoke instructions cannot be extracted, because CodeExtractor
135 // requires unwind destinations to be within the extraction region.
136 //
137 // Resumes that are not reachable from a cleanup landing pad are considered to
138 // be unreachable. It’s not safe to split them out either.
139 if (BB.hasAddressTaken() || BB.isEHPad())
140 return false;
141 auto Term = BB.getTerminator();
142 return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
143}
144
145/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
146/// If \p UpdateEntryCount is true (set when this is a new split function and
147/// module has profile data), set entry count to 0 to ensure treated as cold.
148/// Return true if the function is changed.
149static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
150 assert(!F.hasOptNone() && "Can't mark this cold");
151 bool Changed = false;
152 if (!F.hasFnAttribute(Attribute::Cold)) {
153 F.addFnAttr(Attribute::Cold);
154 Changed = true;
155 }
156 if (!F.hasFnAttribute(Attribute::MinSize)) {
157 F.addFnAttr(Attribute::MinSize);
158 Changed = true;
159 }
160 if (UpdateEntryCount) {
161 // Set the entry count to 0 to ensure it is placed in the unlikely text
162 // section when function sections are enabled.
163 F.setEntryCount(0);
164 Changed = true;
165 }
166
167 return Changed;
168}
169
170} // end anonymous namespace
171
172/// Check whether \p F is inherently cold.
173bool HotColdSplitting::isFunctionCold(const Function &F) const {
174 if (F.hasFnAttribute(Attribute::Cold))
175 return true;
176
177 if (F.getCallingConv() == CallingConv::Cold)
178 return true;
179
180 if (PSI->isFunctionEntryCold(&F))
181 return true;
182
183 return false;
184}
185
186// Returns false if the function should not be considered for hot-cold split
187// optimization.
188bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
189 if (F.hasFnAttribute(Attribute::AlwaysInline))
190 return false;
191
192 if (F.hasFnAttribute(Attribute::NoInline))
193 return false;
194
195 // A function marked `noreturn` may contain unreachable terminators: these
196 // should not be considered cold, as the function may be a trampoline.
197 if (F.hasFnAttribute(Attribute::NoReturn))
198 return false;
199
200 if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
201 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
202 F.hasFnAttribute(Attribute::SanitizeThread) ||
203 F.hasFnAttribute(Attribute::SanitizeMemory))
204 return false;
205
206 return true;
207}
208
209/// Get the benefit score of outlining \p Region.
212 // Sum up the code size costs of non-terminator instructions. Tight coupling
213 // with \ref getOutliningPenalty is needed to model the costs of terminators.
214 InstructionCost Benefit = 0;
215 for (BasicBlock *BB : Region)
217 if (&I != BB->getTerminator())
218 Benefit +=
220
221 return Benefit;
222}
223
224/// Get the penalty score for outlining \p Region.
226 unsigned NumInputs, unsigned NumOutputs) {
227 int Penalty = SplittingThreshold;
228 LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
229
230 // If the splitting threshold is set at or below zero, skip the usual
231 // profitability check.
232 if (SplittingThreshold <= 0)
233 return Penalty;
234
235 // Find the number of distinct exit blocks for the region. Use a conservative
236 // check to determine whether control returns from the region.
237 bool NoBlocksReturn = true;
238 SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
239 for (BasicBlock *BB : Region) {
240 // If a block has no successors, only assume it does not return if it's
241 // unreachable.
242 if (succ_empty(BB)) {
243 NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
244 continue;
245 }
246
247 for (BasicBlock *SuccBB : successors(BB)) {
248 if (!is_contained(Region, SuccBB)) {
249 NoBlocksReturn = false;
250 SuccsOutsideRegion.insert(SuccBB);
251 }
252 }
253 }
254
255 // Count the number of phis in exit blocks with >= 2 incoming values from the
256 // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
257 // and new outputs are created to supply the split phis. CodeExtractor can't
258 // report these new outputs until extraction begins, but it's important to
259 // factor the cost of the outputs into the cost calculation.
260 unsigned NumSplitExitPhis = 0;
261 for (BasicBlock *ExitBB : SuccsOutsideRegion) {
262 for (PHINode &PN : ExitBB->phis()) {
263 // Find all incoming values from the outlining region.
264 int NumIncomingVals = 0;
265 for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
266 if (llvm::is_contained(Region, PN.getIncomingBlock(i))) {
267 ++NumIncomingVals;
268 if (NumIncomingVals > 1) {
269 ++NumSplitExitPhis;
270 break;
271 }
272 }
273 }
274 }
275
276 // Apply a penalty for calling the split function. Factor in the cost of
277 // materializing all of the parameters.
278 int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
279 int NumParams = NumInputs + NumOutputsAndSplitPhis;
280 if (NumParams > MaxParametersForSplit) {
281 LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
282 << " outputs exceeds parameter limit ("
283 << MaxParametersForSplit << ")\n");
284 return std::numeric_limits<int>::max();
285 }
286 const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
287 LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
288 Penalty += CostForArgMaterialization * NumParams;
289
290 // Apply the typical code size cost for an output alloca and its associated
291 // reload in the caller. Also penalize the associated store in the callee.
292 LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
293 << " outputs/split phis\n");
294 const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
295 Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
296
297 // Apply a `noreturn` bonus.
298 if (NoBlocksReturn) {
299 LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
300 << " non-returning terminators\n");
301 Penalty -= Region.size();
302 }
303
304 // Apply a penalty for having more than one successor outside of the region.
305 // This penalty accounts for the switch needed in the caller.
306 if (SuccsOutsideRegion.size() > 1) {
307 LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
308 << " non-region successors\n");
309 Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
310 }
311
312 return Penalty;
313}
314
315Function *HotColdSplitting::extractColdRegion(
318 OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
319 assert(!Region.empty());
320
321 // TODO: Pass BFI and BPI to update profile information.
322 CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
323 /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
324 /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
325 /* Suffix */ "cold." + std::to_string(Count));
326
327 // Perform a simple cost/benefit analysis to decide whether or not to permit
328 // splitting.
329 SetVector<Value *> Inputs, Outputs, Sinks;
330 CE.findInputsOutputs(Inputs, Outputs, Sinks);
331 InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
332 int OutliningPenalty =
333 getOutliningPenalty(Region, Inputs.size(), Outputs.size());
334 LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
335 << ", penalty = " << OutliningPenalty << "\n");
336 if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
337 return nullptr;
338
339 Function *OrigF = Region[0]->getParent();
340 if (Function *OutF = CE.extractCodeRegion(CEAC)) {
341 User *U = *OutF->user_begin();
342 CallInst *CI = cast<CallInst>(U);
343 NumColdRegionsOutlined++;
344 if (TTI.useColdCCForColdCall(*OutF)) {
345 OutF->setCallingConv(CallingConv::Cold);
347 }
348 CI->setIsNoInline();
349
351 OutF->setSection(ColdSectionName);
352 else {
353 if (OrigF->hasSection())
354 OutF->setSection(OrigF->getSection());
355 }
356
357 markFunctionCold(*OutF, BFI != nullptr);
358
359 LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
360 ORE.emit([&]() {
361 return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
362 &*Region[0]->begin())
363 << ore::NV("Original", OrigF) << " split cold code into "
364 << ore::NV("Split", OutF);
365 });
366 return OutF;
367 }
368
369 ORE.emit([&]() {
370 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
371 &*Region[0]->begin())
372 << "Failed to extract region at block "
373 << ore::NV("Block", Region.front());
374 });
375 return nullptr;
376}
377
378/// A pair of (basic block, score).
379using BlockTy = std::pair<BasicBlock *, unsigned>;
380
381namespace {
382/// A maximal outlining region. This contains all blocks post-dominated by a
383/// sink block, the sink block itself, and all blocks dominated by the sink.
384/// If sink-predecessors and sink-successors cannot be extracted in one region,
385/// the static constructor returns a list of suitable extraction regions.
386class OutliningRegion {
387 /// A list of (block, score) pairs. A block's score is non-zero iff it's a
388 /// viable sub-region entry point. Blocks with higher scores are better entry
389 /// points (i.e. they are more distant ancestors of the sink block).
391
392 /// The suggested entry point into the region. If the region has multiple
393 /// entry points, all blocks within the region may not be reachable from this
394 /// entry point.
395 BasicBlock *SuggestedEntryPoint = nullptr;
396
397 /// Whether the entire function is cold.
398 bool EntireFunctionCold = false;
399
400 /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
401 static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
402 return mayExtractBlock(BB) ? Score : 0;
403 }
404
405 /// These scores should be lower than the score for predecessor blocks,
406 /// because regions starting at predecessor blocks are typically larger.
407 static constexpr unsigned ScoreForSuccBlock = 1;
408 static constexpr unsigned ScoreForSinkBlock = 1;
409
410 OutliningRegion(const OutliningRegion &) = delete;
411 OutliningRegion &operator=(const OutliningRegion &) = delete;
412
413public:
414 OutliningRegion() = default;
415 OutliningRegion(OutliningRegion &&) = default;
416 OutliningRegion &operator=(OutliningRegion &&) = default;
417
418 static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
419 const DominatorTree &DT,
420 const PostDominatorTree &PDT) {
421 std::vector<OutliningRegion> Regions;
422 SmallPtrSet<BasicBlock *, 4> RegionBlocks;
423
424 Regions.emplace_back();
425 OutliningRegion *ColdRegion = &Regions.back();
426
427 auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
428 RegionBlocks.insert(BB);
429 ColdRegion->Blocks.emplace_back(BB, Score);
430 };
431
432 // The ancestor farthest-away from SinkBB, and also post-dominated by it.
433 unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
434 ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
435 unsigned BestScore = SinkScore;
436
437 // Visit SinkBB's ancestors using inverse DFS.
438 auto PredIt = ++idf_begin(&SinkBB);
439 auto PredEnd = idf_end(&SinkBB);
440 while (PredIt != PredEnd) {
441 BasicBlock &PredBB = **PredIt;
442 bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
443
444 // If the predecessor is cold and has no predecessors, the entire
445 // function must be cold.
446 if (SinkPostDom && pred_empty(&PredBB)) {
447 ColdRegion->EntireFunctionCold = true;
448 return Regions;
449 }
450
451 // If SinkBB does not post-dominate a predecessor, do not mark the
452 // predecessor (or any of its predecessors) cold.
453 if (!SinkPostDom || !mayExtractBlock(PredBB)) {
454 PredIt.skipChildren();
455 continue;
456 }
457
458 // Keep track of the post-dominated ancestor farthest away from the sink.
459 // The path length is always >= 2, ensuring that predecessor blocks are
460 // considered as entry points before the sink block.
461 unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
462 if (PredScore > BestScore) {
463 ColdRegion->SuggestedEntryPoint = &PredBB;
464 BestScore = PredScore;
465 }
466
467 addBlockToRegion(&PredBB, PredScore);
468 ++PredIt;
469 }
470
471 // If the sink can be added to the cold region, do so. It's considered as
472 // an entry point before any sink-successor blocks.
473 //
474 // Otherwise, split cold sink-successor blocks using a separate region.
475 // This satisfies the requirement that all extraction blocks other than the
476 // first have predecessors within the extraction region.
477 if (mayExtractBlock(SinkBB)) {
478 addBlockToRegion(&SinkBB, SinkScore);
479 if (pred_empty(&SinkBB)) {
480 ColdRegion->EntireFunctionCold = true;
481 return Regions;
482 }
483 } else {
484 Regions.emplace_back();
485 ColdRegion = &Regions.back();
486 BestScore = 0;
487 }
488
489 // Find all successors of SinkBB dominated by SinkBB using DFS.
490 auto SuccIt = ++df_begin(&SinkBB);
491 auto SuccEnd = df_end(&SinkBB);
492 while (SuccIt != SuccEnd) {
493 BasicBlock &SuccBB = **SuccIt;
494 bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
495
496 // Don't allow the backwards & forwards DFSes to mark the same block.
497 bool DuplicateBlock = RegionBlocks.count(&SuccBB);
498
499 // If SinkBB does not dominate a successor, do not mark the successor (or
500 // any of its successors) cold.
501 if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
502 SuccIt.skipChildren();
503 continue;
504 }
505
506 unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
507 if (SuccScore > BestScore) {
508 ColdRegion->SuggestedEntryPoint = &SuccBB;
509 BestScore = SuccScore;
510 }
511
512 addBlockToRegion(&SuccBB, SuccScore);
513 ++SuccIt;
514 }
515
516 return Regions;
517 }
518
519 /// Whether this region has nothing to extract.
520 bool empty() const { return !SuggestedEntryPoint; }
521
522 /// The blocks in this region.
523 ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
524
525 /// Whether the entire function containing this region is cold.
526 bool isEntireFunctionCold() const { return EntireFunctionCold; }
527
528 /// Remove a sub-region from this region and return it as a block sequence.
529 BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
530 assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
531
532 // Remove blocks dominated by the suggested entry point from this region.
533 // During the removal, identify the next best entry point into the region.
534 // Ensure that the first extracted block is the suggested entry point.
535 BlockSequence SubRegion = {SuggestedEntryPoint};
536 BasicBlock *NextEntryPoint = nullptr;
537 unsigned NextScore = 0;
538 auto RegionEndIt = Blocks.end();
539 auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
540 BasicBlock *BB = Block.first;
541 unsigned Score = Block.second;
542 bool InSubRegion =
543 BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
544 if (!InSubRegion && Score > NextScore) {
545 NextEntryPoint = BB;
546 NextScore = Score;
547 }
548 if (InSubRegion && BB != SuggestedEntryPoint)
549 SubRegion.push_back(BB);
550 return InSubRegion;
551 });
552 Blocks.erase(RegionStartIt, RegionEndIt);
553
554 // Update the suggested entry point.
555 SuggestedEntryPoint = NextEntryPoint;
556
557 return SubRegion;
558 }
559};
560} // namespace
561
562bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
563 bool Changed = false;
564
565 // The set of cold blocks.
567
568 // The worklist of non-intersecting regions left to outline.
569 SmallVector<OutliningRegion, 2> OutliningWorklist;
570
571 // Set up an RPO traversal. Experimentally, this performs better (outlines
572 // more) than a PO traversal, because we prevent region overlap by keeping
573 // the first region to contain a block.
575
576 // Calculate domtrees lazily. This reduces compile-time significantly.
577 std::unique_ptr<DominatorTree> DT;
578 std::unique_ptr<PostDominatorTree> PDT;
579
580 // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
581 // reduces compile-time significantly. TODO: When we *do* use BFI, we should
582 // be able to salvage its domtrees instead of recomputing them.
583 BlockFrequencyInfo *BFI = nullptr;
584 if (HasProfileSummary)
585 BFI = GetBFI(F);
586
587 TargetTransformInfo &TTI = GetTTI(F);
588 OptimizationRemarkEmitter &ORE = (*GetORE)(F);
589 AssumptionCache *AC = LookupAC(F);
590
591 // Find all cold regions.
592 for (BasicBlock *BB : RPOT) {
593 // This block is already part of some outlining region.
594 if (ColdBlocks.count(BB))
595 continue;
596
597 bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
598 (EnableStaticAnalysis && unlikelyExecuted(*BB));
599 if (!Cold)
600 continue;
601
602 LLVM_DEBUG({
603 dbgs() << "Found a cold block:\n";
604 BB->dump();
605 });
606
607 if (!DT)
608 DT = std::make_unique<DominatorTree>(F);
609 if (!PDT)
610 PDT = std::make_unique<PostDominatorTree>(F);
611
612 auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
613 for (OutliningRegion &Region : Regions) {
614 if (Region.empty())
615 continue;
616
617 if (Region.isEntireFunctionCold()) {
618 LLVM_DEBUG(dbgs() << "Entire function is cold\n");
619 return markFunctionCold(F);
620 }
621
622 // If this outlining region intersects with another, drop the new region.
623 //
624 // TODO: It's theoretically possible to outline more by only keeping the
625 // largest region which contains a block, but the extra bookkeeping to do
626 // this is tricky/expensive.
627 bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
628 return !ColdBlocks.insert(Block.first).second;
629 });
630 if (RegionsOverlap)
631 continue;
632
633 OutliningWorklist.emplace_back(std::move(Region));
634 ++NumColdRegionsFound;
635 }
636 }
637
638 if (OutliningWorklist.empty())
639 return Changed;
640
641 // Outline single-entry cold regions, splitting up larger regions as needed.
642 unsigned OutlinedFunctionID = 1;
643 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
645 do {
646 OutliningRegion Region = OutliningWorklist.pop_back_val();
647 assert(!Region.empty() && "Empty outlining region in worklist");
648 do {
649 BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
650 LLVM_DEBUG({
651 dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
652 for (BasicBlock *BB : SubRegion)
653 BB->dump();
654 });
655
656 Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
657 ORE, AC, OutlinedFunctionID);
658 if (Outlined) {
659 ++OutlinedFunctionID;
660 Changed = true;
661 }
662 } while (!Region.empty());
663 } while (!OutliningWorklist.empty());
664
665 return Changed;
666}
667
669 bool Changed = false;
670 bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
671 for (Function &F : M) {
672 // Do not touch declarations.
673 if (F.isDeclaration())
674 continue;
675
676 // Do not modify `optnone` functions.
677 if (F.hasOptNone())
678 continue;
679
680 // Detect inherently cold functions and mark them as such.
681 if (isFunctionCold(F)) {
682 Changed |= markFunctionCold(F);
683 continue;
684 }
685
686 if (!shouldOutlineFrom(F)) {
687 LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
688 continue;
689 }
690
691 LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
692 Changed |= outlineColdRegions(F, HasProfileSummary);
693 }
694 return Changed;
695}
696
699 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
700
701 auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
703 };
704
705 auto GBFI = [&FAM](Function &F) {
707 };
708
709 std::function<TargetTransformInfo &(Function &)> GTTI =
712 };
713
714 std::unique_ptr<OptimizationRemarkEmitter> ORE;
715 std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
716 [&ORE](Function &F) -> OptimizationRemarkEmitter & {
717 ORE.reset(new OptimizationRemarkEmitter(&F));
718 return *ORE;
719 };
720
722
723 if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
725 return PreservedAnalyses::all();
726}
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:491
#define DEBUG_TYPE
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Module.h This file contains the declarations for the Module class.
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:793
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
bool empty() const
Definition: BasicBlock.h:334
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches,...
Definition: BasicBlock.h:504
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:521
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1475
void setIsNoInline()
Definition: InstrTypes.h:1887
This class represents a function call, abstracting a target machine's calling convention.
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
StringRef getSection() const
Get the custom section of this global if it has one.
Definition: GlobalObject.h:117
bool hasSection() const
Check if this global has a custom object file section.
Definition: GlobalObject.h:109
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:933
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
block_range blocks()
Returns a range view of the basic blocks in the region.
Definition: RegionInfo.h:622
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:364
A vector that has set insertion semantics.
Definition: SetVector.h:51
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:88
size_type size() const
Definition: SmallPtrSet.h:93
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
bool empty() const
Definition: SmallVector.h:94
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ TCK_CodeSize
Instruction code size.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4938
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
auto successors(const MachineBasicBlock *BB)
df_iterator< T > df_begin(const T &G)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1826
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
idf_iterator< T > idf_end(const T &G)
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1858
idf_iterator< T > idf_begin(const T &G)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1976
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:118
df_iterator< T > df_end(const T &G)