LLVM  16.0.0git
HotColdSplitting.cpp
Go to the documentation of this file.
1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// The goal of hot/cold splitting is to improve the memory locality of code.
11 /// The splitting pass does this by identifying cold blocks and moving them into
12 /// separate functions.
13 ///
14 /// When the splitting pass finds a cold block (referred to as "the sink"), it
15 /// grows a maximal cold region around that block. The maximal region contains
16 /// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17 /// cold as the sink. Once a region is found, it's split out of the original
18 /// function provided it's profitable to do so.
19 ///
20 /// [*] In practice, there is some added complexity because some blocks are not
21 /// safe to extract.
22 ///
23 /// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24 /// TODO: Reorder outlined functions.
25 ///
26 //===----------------------------------------------------------------------===//
27 
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
38 #include "llvm/IR/BasicBlock.h"
39 #include "llvm/IR/CFG.h"
40 #include "llvm/IR/DiagnosticInfo.h"
41 #include "llvm/IR/Dominators.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/Instruction.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/Module.h"
46 #include "llvm/IR/PassManager.h"
47 #include "llvm/IR/User.h"
48 #include "llvm/IR/Value.h"
49 #include "llvm/InitializePasses.h"
50 #include "llvm/Pass.h"
52 #include "llvm/Support/Debug.h"
54 #include "llvm/Transforms/IPO.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <limits>
59 #include <string>
60 
61 #define DEBUG_TYPE "hotcoldsplit"
62 
63 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
64 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
65 
66 using namespace llvm;
67 
68 static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
69  cl::init(true), cl::Hidden);
70 
71 static cl::opt<int>
72  SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
73  cl::desc("Base penalty for splitting cold code (as a "
74  "multiple of TCC_Basic)"));
75 
77  "enable-cold-section", cl::init(false), cl::Hidden,
78  cl::desc("Enable placement of extracted cold functions"
79  " into a separate section after hot-cold splitting."));
80 
82  ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
83  cl::Hidden,
84  cl::desc("Name for the section containing cold functions "
85  "extracted by hot-cold splitting."));
86 
88  "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
89  cl::desc("Maximum number of parameters for a split function"));
90 
91 namespace {
92 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
93 // this function unless you modify the MBB version as well.
94 //
95 /// A no successor, non-return block probably ends in unreachable and is cold.
96 /// Also consider a block that ends in an indirect branch to be a return block,
97 /// since many targets use plain indirect branches to return.
99  if (!succ_empty(&BB))
100  return false;
101  if (BB.empty())
102  return true;
103  const Instruction *I = BB.getTerminator();
104  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
105 }
106 
107 bool unlikelyExecuted(BasicBlock &BB) {
108  // Exception handling blocks are unlikely executed.
109  if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
110  return true;
111 
112  // The block is cold if it calls/invokes a cold function. However, do not
113  // mark sanitizer traps as cold.
114  for (Instruction &I : BB)
115  if (auto *CB = dyn_cast<CallBase>(&I))
116  if (CB->hasFnAttr(Attribute::Cold) &&
117  !CB->getMetadata(LLVMContext::MD_nosanitize))
118  return true;
119 
120  // The block is cold if it has an unreachable terminator, unless it's
121  // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
122  if (blockEndsInUnreachable(BB)) {
123  if (auto *CI =
124  dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
125  if (CI->hasFnAttr(Attribute::NoReturn))
126  return false;
127  return true;
128  }
129 
130  return false;
131 }
132 
133 /// Check whether it's safe to outline \p BB.
134 static bool mayExtractBlock(const BasicBlock &BB) {
135  // EH pads are unsafe to outline because doing so breaks EH type tables. It
136  // follows that invoke instructions cannot be extracted, because CodeExtractor
137  // requires unwind destinations to be within the extraction region.
138  //
139  // Resumes that are not reachable from a cleanup landing pad are considered to
140  // be unreachable. It’s not safe to split them out either.
141  if (BB.hasAddressTaken() || BB.isEHPad())
142  return false;
143  auto Term = BB.getTerminator();
144  return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
145 }
146 
147 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
148 /// If \p UpdateEntryCount is true (set when this is a new split function and
149 /// module has profile data), set entry count to 0 to ensure treated as cold.
150 /// Return true if the function is changed.
151 static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
152  assert(!F.hasOptNone() && "Can't mark this cold");
153  bool Changed = false;
154  if (!F.hasFnAttribute(Attribute::Cold)) {
155  F.addFnAttr(Attribute::Cold);
156  Changed = true;
157  }
158  if (!F.hasFnAttribute(Attribute::MinSize)) {
159  F.addFnAttr(Attribute::MinSize);
160  Changed = true;
161  }
162  if (UpdateEntryCount) {
163  // Set the entry count to 0 to ensure it is placed in the unlikely text
164  // section when function sections are enabled.
165  F.setEntryCount(0);
166  Changed = true;
167  }
168 
169  return Changed;
170 }
171 
172 class HotColdSplittingLegacyPass : public ModulePass {
173 public:
174  static char ID;
175  HotColdSplittingLegacyPass() : ModulePass(ID) {
177  }
178 
179  void getAnalysisUsage(AnalysisUsage &AU) const override {
184  }
185 
186  bool runOnModule(Module &M) override;
187 };
188 
189 } // end anonymous namespace
190 
191 /// Check whether \p F is inherently cold.
192 bool HotColdSplitting::isFunctionCold(const Function &F) const {
193  if (F.hasFnAttribute(Attribute::Cold))
194  return true;
195 
196  if (F.getCallingConv() == CallingConv::Cold)
197  return true;
198 
199  if (PSI->isFunctionEntryCold(&F))
200  return true;
201 
202  return false;
203 }
204 
205 // Returns false if the function should not be considered for hot-cold split
206 // optimization.
207 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
208  if (F.hasFnAttribute(Attribute::AlwaysInline))
209  return false;
210 
211  if (F.hasFnAttribute(Attribute::NoInline))
212  return false;
213 
214  // A function marked `noreturn` may contain unreachable terminators: these
215  // should not be considered cold, as the function may be a trampoline.
216  if (F.hasFnAttribute(Attribute::NoReturn))
217  return false;
218 
219  if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
220  F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
221  F.hasFnAttribute(Attribute::SanitizeThread) ||
222  F.hasFnAttribute(Attribute::SanitizeMemory))
223  return false;
224 
225  return true;
226 }
227 
228 /// Get the benefit score of outlining \p Region.
231  // Sum up the code size costs of non-terminator instructions. Tight coupling
232  // with \ref getOutliningPenalty is needed to model the costs of terminators.
233  InstructionCost Benefit = 0;
234  for (BasicBlock *BB : Region)
235  for (Instruction &I : BB->instructionsWithoutDebug())
236  if (&I != BB->getTerminator())
237  Benefit +=
239 
240  return Benefit;
241 }
242 
243 /// Get the penalty score for outlining \p Region.
245  unsigned NumInputs, unsigned NumOutputs) {
246  int Penalty = SplittingThreshold;
247  LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
248 
249  // If the splitting threshold is set at or below zero, skip the usual
250  // profitability check.
251  if (SplittingThreshold <= 0)
252  return Penalty;
253 
254  // Find the number of distinct exit blocks for the region. Use a conservative
255  // check to determine whether control returns from the region.
256  bool NoBlocksReturn = true;
257  SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
258  for (BasicBlock *BB : Region) {
259  // If a block has no successors, only assume it does not return if it's
260  // unreachable.
261  if (succ_empty(BB)) {
262  NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
263  continue;
264  }
265 
266  for (BasicBlock *SuccBB : successors(BB)) {
267  if (!is_contained(Region, SuccBB)) {
268  NoBlocksReturn = false;
269  SuccsOutsideRegion.insert(SuccBB);
270  }
271  }
272  }
273 
274  // Count the number of phis in exit blocks with >= 2 incoming values from the
275  // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
276  // and new outputs are created to supply the split phis. CodeExtractor can't
277  // report these new outputs until extraction begins, but it's important to
278  // factor the cost of the outputs into the cost calculation.
279  unsigned NumSplitExitPhis = 0;
280  for (BasicBlock *ExitBB : SuccsOutsideRegion) {
281  for (PHINode &PN : ExitBB->phis()) {
282  // Find all incoming values from the outlining region.
283  int NumIncomingVals = 0;
284  for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
285  if (llvm::is_contained(Region, PN.getIncomingBlock(i))) {
286  ++NumIncomingVals;
287  if (NumIncomingVals > 1) {
288  ++NumSplitExitPhis;
289  break;
290  }
291  }
292  }
293  }
294 
295  // Apply a penalty for calling the split function. Factor in the cost of
296  // materializing all of the parameters.
297  int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
298  int NumParams = NumInputs + NumOutputsAndSplitPhis;
299  if (NumParams > MaxParametersForSplit) {
300  LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
301  << " outputs exceeds parameter limit ("
302  << MaxParametersForSplit << ")\n");
304  }
305  const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
306  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
307  Penalty += CostForArgMaterialization * NumParams;
308 
309  // Apply the typical code size cost for an output alloca and its associated
310  // reload in the caller. Also penalize the associated store in the callee.
311  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
312  << " outputs/split phis\n");
313  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
314  Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
315 
316  // Apply a `noreturn` bonus.
317  if (NoBlocksReturn) {
318  LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
319  << " non-returning terminators\n");
320  Penalty -= Region.size();
321  }
322 
323  // Apply a penalty for having more than one successor outside of the region.
324  // This penalty accounts for the switch needed in the caller.
325  if (SuccsOutsideRegion.size() > 1) {
326  LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
327  << " non-region successors\n");
328  Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
329  }
330 
331  return Penalty;
332 }
333 
334 Function *HotColdSplitting::extractColdRegion(
335  const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
337  OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
338  assert(!Region.empty());
339 
340  // TODO: Pass BFI and BPI to update profile information.
341  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
342  /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
343  /* AllowAlloca */ false, /* AllocaBlock */ nullptr,
344  /* Suffix */ "cold." + std::to_string(Count));
345 
346  // Perform a simple cost/benefit analysis to decide whether or not to permit
347  // splitting.
348  SetVector<Value *> Inputs, Outputs, Sinks;
349  CE.findInputsOutputs(Inputs, Outputs, Sinks);
350  InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
351  int OutliningPenalty =
352  getOutliningPenalty(Region, Inputs.size(), Outputs.size());
353  LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
354  << ", penalty = " << OutliningPenalty << "\n");
355  if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
356  return nullptr;
357 
358  Function *OrigF = Region[0]->getParent();
359  if (Function *OutF = CE.extractCodeRegion(CEAC)) {
360  User *U = *OutF->user_begin();
361  CallInst *CI = cast<CallInst>(U);
362  NumColdRegionsOutlined++;
363  if (TTI.useColdCCForColdCall(*OutF)) {
364  OutF->setCallingConv(CallingConv::Cold);
366  }
367  CI->setIsNoInline();
368 
369  if (EnableColdSection)
370  OutF->setSection(ColdSectionName);
371  else {
372  if (OrigF->hasSection())
373  OutF->setSection(OrigF->getSection());
374  }
375 
376  markFunctionCold(*OutF, BFI != nullptr);
377 
378  LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
379  ORE.emit([&]() {
380  return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
381  &*Region[0]->begin())
382  << ore::NV("Original", OrigF) << " split cold code into "
383  << ore::NV("Split", OutF);
384  });
385  return OutF;
386  }
387 
388  ORE.emit([&]() {
389  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
390  &*Region[0]->begin())
391  << "Failed to extract region at block "
392  << ore::NV("Block", Region.front());
393  });
394  return nullptr;
395 }
396 
397 /// A pair of (basic block, score).
398 using BlockTy = std::pair<BasicBlock *, unsigned>;
399 
400 namespace {
401 /// A maximal outlining region. This contains all blocks post-dominated by a
402 /// sink block, the sink block itself, and all blocks dominated by the sink.
403 /// If sink-predecessors and sink-successors cannot be extracted in one region,
404 /// the static constructor returns a list of suitable extraction regions.
405 class OutliningRegion {
406  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
407  /// viable sub-region entry point. Blocks with higher scores are better entry
408  /// points (i.e. they are more distant ancestors of the sink block).
409  SmallVector<BlockTy, 0> Blocks = {};
410 
411  /// The suggested entry point into the region. If the region has multiple
412  /// entry points, all blocks within the region may not be reachable from this
413  /// entry point.
414  BasicBlock *SuggestedEntryPoint = nullptr;
415 
416  /// Whether the entire function is cold.
417  bool EntireFunctionCold = false;
418 
419  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
420  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
421  return mayExtractBlock(BB) ? Score : 0;
422  }
423 
424  /// These scores should be lower than the score for predecessor blocks,
425  /// because regions starting at predecessor blocks are typically larger.
426  static constexpr unsigned ScoreForSuccBlock = 1;
427  static constexpr unsigned ScoreForSinkBlock = 1;
428 
429  OutliningRegion(const OutliningRegion &) = delete;
430  OutliningRegion &operator=(const OutliningRegion &) = delete;
431 
432 public:
433  OutliningRegion() = default;
434  OutliningRegion(OutliningRegion &&) = default;
435  OutliningRegion &operator=(OutliningRegion &&) = default;
436 
437  static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
438  const DominatorTree &DT,
439  const PostDominatorTree &PDT) {
440  std::vector<OutliningRegion> Regions;
441  SmallPtrSet<BasicBlock *, 4> RegionBlocks;
442 
443  Regions.emplace_back();
444  OutliningRegion *ColdRegion = &Regions.back();
445 
446  auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
447  RegionBlocks.insert(BB);
448  ColdRegion->Blocks.emplace_back(BB, Score);
449  };
450 
451  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
452  unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
453  ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
454  unsigned BestScore = SinkScore;
455 
456  // Visit SinkBB's ancestors using inverse DFS.
457  auto PredIt = ++idf_begin(&SinkBB);
458  auto PredEnd = idf_end(&SinkBB);
459  while (PredIt != PredEnd) {
460  BasicBlock &PredBB = **PredIt;
461  bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
462 
463  // If the predecessor is cold and has no predecessors, the entire
464  // function must be cold.
465  if (SinkPostDom && pred_empty(&PredBB)) {
466  ColdRegion->EntireFunctionCold = true;
467  return Regions;
468  }
469 
470  // If SinkBB does not post-dominate a predecessor, do not mark the
471  // predecessor (or any of its predecessors) cold.
472  if (!SinkPostDom || !mayExtractBlock(PredBB)) {
473  PredIt.skipChildren();
474  continue;
475  }
476 
477  // Keep track of the post-dominated ancestor farthest away from the sink.
478  // The path length is always >= 2, ensuring that predecessor blocks are
479  // considered as entry points before the sink block.
480  unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
481  if (PredScore > BestScore) {
482  ColdRegion->SuggestedEntryPoint = &PredBB;
483  BestScore = PredScore;
484  }
485 
486  addBlockToRegion(&PredBB, PredScore);
487  ++PredIt;
488  }
489 
490  // If the sink can be added to the cold region, do so. It's considered as
491  // an entry point before any sink-successor blocks.
492  //
493  // Otherwise, split cold sink-successor blocks using a separate region.
494  // This satisfies the requirement that all extraction blocks other than the
495  // first have predecessors within the extraction region.
496  if (mayExtractBlock(SinkBB)) {
497  addBlockToRegion(&SinkBB, SinkScore);
498  if (pred_empty(&SinkBB)) {
499  ColdRegion->EntireFunctionCold = true;
500  return Regions;
501  }
502  } else {
503  Regions.emplace_back();
504  ColdRegion = &Regions.back();
505  BestScore = 0;
506  }
507 
508  // Find all successors of SinkBB dominated by SinkBB using DFS.
509  auto SuccIt = ++df_begin(&SinkBB);
510  auto SuccEnd = df_end(&SinkBB);
511  while (SuccIt != SuccEnd) {
512  BasicBlock &SuccBB = **SuccIt;
513  bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
514 
515  // Don't allow the backwards & forwards DFSes to mark the same block.
516  bool DuplicateBlock = RegionBlocks.count(&SuccBB);
517 
518  // If SinkBB does not dominate a successor, do not mark the successor (or
519  // any of its successors) cold.
520  if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
521  SuccIt.skipChildren();
522  continue;
523  }
524 
525  unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
526  if (SuccScore > BestScore) {
527  ColdRegion->SuggestedEntryPoint = &SuccBB;
528  BestScore = SuccScore;
529  }
530 
531  addBlockToRegion(&SuccBB, SuccScore);
532  ++SuccIt;
533  }
534 
535  return Regions;
536  }
537 
538  /// Whether this region has nothing to extract.
539  bool empty() const { return !SuggestedEntryPoint; }
540 
541  /// The blocks in this region.
542  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
543 
544  /// Whether the entire function containing this region is cold.
545  bool isEntireFunctionCold() const { return EntireFunctionCold; }
546 
547  /// Remove a sub-region from this region and return it as a block sequence.
548  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
549  assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
550 
551  // Remove blocks dominated by the suggested entry point from this region.
552  // During the removal, identify the next best entry point into the region.
553  // Ensure that the first extracted block is the suggested entry point.
554  BlockSequence SubRegion = {SuggestedEntryPoint};
555  BasicBlock *NextEntryPoint = nullptr;
556  unsigned NextScore = 0;
557  auto RegionEndIt = Blocks.end();
558  auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
559  BasicBlock *BB = Block.first;
560  unsigned Score = Block.second;
561  bool InSubRegion =
562  BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
563  if (!InSubRegion && Score > NextScore) {
564  NextEntryPoint = BB;
565  NextScore = Score;
566  }
567  if (InSubRegion && BB != SuggestedEntryPoint)
568  SubRegion.push_back(BB);
569  return InSubRegion;
570  });
571  Blocks.erase(RegionStartIt, RegionEndIt);
572 
573  // Update the suggested entry point.
574  SuggestedEntryPoint = NextEntryPoint;
575 
576  return SubRegion;
577  }
578 };
579 } // namespace
580 
581 bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
582  bool Changed = false;
583 
584  // The set of cold blocks.
585  SmallPtrSet<BasicBlock *, 4> ColdBlocks;
586 
587  // The worklist of non-intersecting regions left to outline.
588  SmallVector<OutliningRegion, 2> OutliningWorklist;
589 
590  // Set up an RPO traversal. Experimentally, this performs better (outlines
591  // more) than a PO traversal, because we prevent region overlap by keeping
592  // the first region to contain a block.
594 
595  // Calculate domtrees lazily. This reduces compile-time significantly.
596  std::unique_ptr<DominatorTree> DT;
597  std::unique_ptr<PostDominatorTree> PDT;
598 
599  // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
600  // reduces compile-time significantly. TODO: When we *do* use BFI, we should
601  // be able to salvage its domtrees instead of recomputing them.
602  BlockFrequencyInfo *BFI = nullptr;
603  if (HasProfileSummary)
604  BFI = GetBFI(F);
605 
606  TargetTransformInfo &TTI = GetTTI(F);
607  OptimizationRemarkEmitter &ORE = (*GetORE)(F);
608  AssumptionCache *AC = LookupAC(F);
609 
610  // Find all cold regions.
611  for (BasicBlock *BB : RPOT) {
612  // This block is already part of some outlining region.
613  if (ColdBlocks.count(BB))
614  continue;
615 
616  bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
617  (EnableStaticAnalysis && unlikelyExecuted(*BB));
618  if (!Cold)
619  continue;
620 
621  LLVM_DEBUG({
622  dbgs() << "Found a cold block:\n";
623  BB->dump();
624  });
625 
626  if (!DT)
627  DT = std::make_unique<DominatorTree>(F);
628  if (!PDT)
629  PDT = std::make_unique<PostDominatorTree>(F);
630 
631  auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
632  for (OutliningRegion &Region : Regions) {
633  if (Region.empty())
634  continue;
635 
636  if (Region.isEntireFunctionCold()) {
637  LLVM_DEBUG(dbgs() << "Entire function is cold\n");
638  return markFunctionCold(F);
639  }
640 
641  // If this outlining region intersects with another, drop the new region.
642  //
643  // TODO: It's theoretically possible to outline more by only keeping the
644  // largest region which contains a block, but the extra bookkeeping to do
645  // this is tricky/expensive.
646  bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
647  return !ColdBlocks.insert(Block.first).second;
648  });
649  if (RegionsOverlap)
650  continue;
651 
652  OutliningWorklist.emplace_back(std::move(Region));
653  ++NumColdRegionsFound;
654  }
655  }
656 
657  if (OutliningWorklist.empty())
658  return Changed;
659 
660  // Outline single-entry cold regions, splitting up larger regions as needed.
661  unsigned OutlinedFunctionID = 1;
662  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
664  do {
665  OutliningRegion Region = OutliningWorklist.pop_back_val();
666  assert(!Region.empty() && "Empty outlining region in worklist");
667  do {
668  BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
669  LLVM_DEBUG({
670  dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
671  for (BasicBlock *BB : SubRegion)
672  BB->dump();
673  });
674 
675  Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
676  ORE, AC, OutlinedFunctionID);
677  if (Outlined) {
678  ++OutlinedFunctionID;
679  Changed = true;
680  }
681  } while (!Region.empty());
682  } while (!OutliningWorklist.empty());
683 
684  return Changed;
685 }
686 
688  bool Changed = false;
689  bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
690  for (Function &F : M) {
691  // Do not touch declarations.
692  if (F.isDeclaration())
693  continue;
694 
695  // Do not modify `optnone` functions.
696  if (F.hasOptNone())
697  continue;
698 
699  // Detect inherently cold functions and mark them as such.
700  if (isFunctionCold(F)) {
701  Changed |= markFunctionCold(F);
702  continue;
703  }
704 
705  if (!shouldOutlineFrom(F)) {
706  LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
707  continue;
708  }
709 
710  LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
711  Changed |= outlineColdRegions(F, HasProfileSummary);
712  }
713  return Changed;
714 }
715 
716 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
717  if (skipModule(M))
718  return false;
719  ProfileSummaryInfo *PSI =
720  &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
721  auto GTTI = [this](Function &F) -> TargetTransformInfo & {
722  return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
723  };
724  auto GBFI = [this](Function &F) {
725  return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
726  };
727  std::unique_ptr<OptimizationRemarkEmitter> ORE;
729  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
730  ORE.reset(new OptimizationRemarkEmitter(&F));
731  return *ORE;
732  };
733  auto LookupAC = [this](Function &F) -> AssumptionCache * {
734  if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
735  return ACT->lookupAssumptionCache(F);
736  return nullptr;
737  };
738 
739  return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
740 }
741 
744  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
745 
746  auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
748  };
749 
750  auto GBFI = [&FAM](Function &F) {
752  };
753 
755  [&FAM](Function &F) -> TargetTransformInfo & {
756  return FAM.getResult<TargetIRAnalysis>(F);
757  };
758 
759  std::unique_ptr<OptimizationRemarkEmitter> ORE;
761  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
762  ORE.reset(new OptimizationRemarkEmitter(&F));
763  return *ORE;
764  };
765 
767 
768  if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
769  return PreservedAnalyses::none();
770  return PreservedAnalyses::all();
771 }
772 
774 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
775  "Hot Cold Splitting", false, false)
778 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
779  "Hot Cold Splitting", false, false)
780 
782  return new HotColdSplittingLegacyPass();
783 }
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
AssumptionCache.h
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2570
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:735
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::SmallVectorImpl::erase
iterator erase(const_iterator CI)
Definition: SmallVector.h:723
llvm::TargetTransformInfo::getInstructionCost
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
Definition: TargetTransformInfo.cpp:224
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
llvm::Function
Definition: Function.h:60
BlockTy
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
Definition: HotColdSplitting.cpp:398
Pass.h
llvm::BlockFrequencyInfoWrapperPass
Legacy analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:138
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::GlobalObject::getSection
StringRef getSection() const
Get the custom section of this global if it has one.
Definition: GlobalObject.h:111
llvm::CallBase::setIsNoInline
void setIsNoInline()
Definition: InstrTypes.h:1849
ColdSectionName
static cl::opt< std::string > ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"), cl::Hidden, cl::desc("Name for the section containing cold functions " "extracted by hot-cold splitting."))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1181
Statistic.h
llvm::HotColdSplittingPass::run
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Definition: HotColdSplitting.cpp:743
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:172
llvm::df_end
df_iterator< T > df_end(const T &G)
Definition: DepthFirstIterator.h:224
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:220
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
llvm::TargetTransformInfo::useColdCCForColdCall
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
Definition: TargetTransformInfo.cpp:507
Module.h
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:221
llvm::HotColdSplitting
Definition: HotColdSplitting.h:33
HotColdSplitting.h
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
CodeExtractor.h
llvm::successors
auto successors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:29
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::HotColdSplitting::run
bool run(Module &M)
Definition: HotColdSplitting.cpp:687
llvm::succ_empty
bool succ_empty(const Instruction *I)
Definition: CFG.h:255
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
llvm::idf_begin
idf_iterator< T > idf_begin(const T &G)
Definition: DepthFirstIterator.h:268
llvm::remove_if
auto remove_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1629
Instruction.h
CommandLine.h
SplittingThreshold
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
llvm::idf_end
idf_iterator< T > idf_end(const T &G)
Definition: DepthFirstIterator.h:273
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:24
PostDominators.h
llvm::User
Definition: User.h:44
DEBUG_TYPE
#define DEBUG_TYPE
Definition: HotColdSplitting.cpp:61
hotcoldsplit
hotcoldsplit
Definition: HotColdSplitting.cpp:778
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
false
Definition: StackSlotColoring.cpp:141
EnableStaticAnalysis
static cl::opt< bool > EnableStaticAnalysis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
llvm::Instruction
Definition: Instruction.h:42
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::createHotColdSplittingPass
ModulePass * createHotColdSplittingPass()
createHotColdSplittingPass - This pass outlines cold blocks into a separate function(s).
Definition: HotColdSplitting.cpp:781
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
Splitting
Hot Cold Splitting
Definition: HotColdSplitting.cpp:779
llvm::RegionBase::blocks
block_range blocks()
Returns a range view of the basic blocks in the region.
Definition: RegionInfo.h:622
BasicBlock.h
llvm::cl::opt< bool >
llvm::GlobalObject::hasSection
bool hasSection() const
Check if this global has a custom object file section.
Definition: GlobalObject.h:103
getOutliningBenefit
static InstructionCost getOutliningBenefit(ArrayRef< BasicBlock * > Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
Definition: HotColdSplitting.cpp:229
llvm::initializeHotColdSplittingLegacyPassPass
void initializeHotColdSplittingLegacyPassPass(PassRegistry &)
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2626
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
EnableColdSection
static cl::opt< bool > EnableColdSection("enable-cold-section", cl::init(false), cl::Hidden, cl::desc("Enable placement of extracted cold functions" " into a separate section after hot-cold splitting."))
move
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
Definition: README.txt:546
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", "Hot Cold Splitting", false, false) INITIALIZE_PASS_END(HotColdSplittingLegacyPass
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:193
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1673
llvm::df_begin
df_iterator< T > df_begin(const T &G)
Definition: DepthFirstIterator.h:219
getOutliningPenalty
static int getOutliningPenalty(ArrayRef< BasicBlock * > Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
Definition: HotColdSplitting.cpp:244
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:211
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1597
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::PostDominatorTree
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Definition: PostDominators.h:28
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::pred_empty
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:118
llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:79
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:429
STATISTIC
STATISTIC(NumColdRegionsFound, "Number of cold regions found.")
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:256
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:793
DiagnosticInfo.h
Function.h
PassManager.h
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:291
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:331
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:690
Instructions.h
MaxParametersForSplit
static cl::opt< int > MaxParametersForSplit("hotcoldsplit-max-params", cl::init(4), cl::Hidden, cl::desc("Maximum number of parameters for a split function"))
PostOrderIterator.h
SmallVector.h
User.h
Dominators.h
llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition: SmallVector.h:659
llvm::to_string
std::string to_string(const T &Value)
Definition: ScopedPrinter.h:85
TargetTransformInfo.h
llvm::PHINode
Definition: Instructions.h:2699
blockEndsInUnreachable
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
Definition: BranchFolding.cpp:519
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::AnalysisUsage::addUsedIfAvailable
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
Definition: PassAnalysisSupport.h:117
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:931
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:244
llvm::cl::desc
Definition: CommandLine.h:413
llvm::Region
Definition: RegionInfo.h:889
llvm::M68kBeads::Term
@ Term
Definition: M68kBaseInfo.h:71
raw_ostream.h
llvm::SetVector< Value * >
llvm::CallingConv::Cold
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
llvm::PostDominatorTree::dominates
bool dominates(const Instruction *I1, const Instruction *I2) const
Return true if I1 dominates I2.
Definition: PostDominators.cpp:54
Value.h
InitializePasses.h
Debug.h
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1459
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:923
llvm::RegionBase::getParent
RegionT * getParent() const
Get the parent of the Region.
Definition: RegionInfo.h:364
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38