LLVM  10.0.0svn
HotColdSplitting.cpp
Go to the documentation of this file.
1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// The goal of hot/cold splitting is to improve the memory locality of code.
11 /// The splitting pass does this by identifying cold blocks and moving them into
12 /// separate functions.
13 ///
14 /// When the splitting pass finds a cold block (referred to as "the sink"), it
15 /// grows a maximal cold region around that block. The maximal region contains
16 /// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17 /// cold as the sink. Once a region is found, it's split out of the original
18 /// function provided it's profitable to do so.
19 ///
20 /// [*] In practice, there is some added complexity because some blocks are not
21 /// safe to extract.
22 ///
23 /// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24 /// TODO: Reorder outlined functions.
25 ///
26 //===----------------------------------------------------------------------===//
27 
29 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/ADT/Statistic.h"
34 #include "llvm/Analysis/CFG.h"
39 #include "llvm/IR/BasicBlock.h"
40 #include "llvm/IR/CFG.h"
41 #include "llvm/IR/CallSite.h"
42 #include "llvm/IR/DataLayout.h"
43 #include "llvm/IR/DiagnosticInfo.h"
44 #include "llvm/IR/Dominators.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/Instruction.h"
47 #include "llvm/IR/Instructions.h"
48 #include "llvm/IR/IntrinsicInst.h"
49 #include "llvm/IR/Metadata.h"
50 #include "llvm/IR/Module.h"
51 #include "llvm/IR/PassManager.h"
52 #include "llvm/IR/Type.h"
53 #include "llvm/IR/Use.h"
54 #include "llvm/IR/User.h"
55 #include "llvm/IR/Value.h"
56 #include "llvm/Pass.h"
59 #include "llvm/Support/Debug.h"
61 #include "llvm/Transforms/IPO.h"
63 #include "llvm/Transforms/Scalar.h"
69 #include <algorithm>
70 #include <cassert>
71 
72 #define DEBUG_TYPE "hotcoldsplit"
73 
74 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
75 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
76 
77 using namespace llvm;
78 
79 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
80  cl::init(true), cl::Hidden);
81 
82 static cl::opt<int>
83  SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
84  cl::desc("Base penalty for splitting cold code (as a "
85  "multiple of TCC_Basic)"));
86 
87 namespace {
88 
89 /// A sequence of basic blocks.
90 ///
91 /// A 0-sized SmallVector is slightly cheaper to move than a std::vector.
92 using BlockSequence = SmallVector<BasicBlock *, 0>;
93 
94 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
95 // this function unless you modify the MBB version as well.
96 //
97 /// A no successor, non-return block probably ends in unreachable and is cold.
98 /// Also consider a block that ends in an indirect branch to be a return block,
99 /// since many targets use plain indirect branches to return.
100 bool blockEndsInUnreachable(const BasicBlock &BB) {
101  if (!succ_empty(&BB))
102  return false;
103  if (BB.empty())
104  return true;
105  const Instruction *I = BB.getTerminator();
106  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
107 }
108 
109 bool unlikelyExecuted(BasicBlock &BB) {
110  // Exception handling blocks are unlikely executed.
111  if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
112  return true;
113 
114  // The block is cold if it calls/invokes a cold function. However, do not
115  // mark sanitizer traps as cold.
116  for (Instruction &I : BB)
117  if (auto CS = CallSite(&I))
118  if (CS.hasFnAttr(Attribute::Cold) && !CS->getMetadata("nosanitize"))
119  return true;
120 
121  // The block is cold if it has an unreachable terminator, unless it's
122  // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
123  if (blockEndsInUnreachable(BB)) {
124  if (auto *CI =
125  dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
126  if (CI->hasFnAttr(Attribute::NoReturn))
127  return false;
128  return true;
129  }
130 
131  return false;
132 }
133 
134 /// Check whether it's safe to outline \p BB.
135 static bool mayExtractBlock(const BasicBlock &BB) {
136  // EH pads are unsafe to outline because doing so breaks EH type tables. It
137  // follows that invoke instructions cannot be extracted, because CodeExtractor
138  // requires unwind destinations to be within the extraction region.
139  //
140  // Resumes that are not reachable from a cleanup landing pad are considered to
141  // be unreachable. It’s not safe to split them out either.
142  auto Term = BB.getTerminator();
143  return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
144  !isa<ResumeInst>(Term);
145 }
146 
147 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
148 /// If \p UpdateEntryCount is true (set when this is a new split function and
149 /// module has profile data), set entry count to 0 to ensure treated as cold.
150 /// Return true if the function is changed.
151 static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
152  assert(!F.hasOptNone() && "Can't mark this cold");
153  bool Changed = false;
156  Changed = true;
157  }
158  if (!F.hasFnAttribute(Attribute::MinSize)) {
159  F.addFnAttr(Attribute::MinSize);
160  Changed = true;
161  }
162  if (UpdateEntryCount) {
163  // Set the entry count to 0 to ensure it is placed in the unlikely text
164  // section when function sections are enabled.
165  F.setEntryCount(0);
166  Changed = true;
167  }
168 
169  return Changed;
170 }
171 
172 class HotColdSplitting {
173 public:
174  HotColdSplitting(ProfileSummaryInfo *ProfSI,
179  : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE), LookupAC(LAC) {}
180  bool run(Module &M);
181 
182 private:
183  bool isFunctionCold(const Function &F) const;
184  bool shouldOutlineFrom(const Function &F) const;
185  bool outlineColdRegions(Function &F, bool HasProfileSummary);
186  Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
189  AssumptionCache *AC, unsigned Count);
190  ProfileSummaryInfo *PSI;
193  std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
195 };
196 
197 class HotColdSplittingLegacyPass : public ModulePass {
198 public:
199  static char ID;
200  HotColdSplittingLegacyPass() : ModulePass(ID) {
202  }
203 
204  void getAnalysisUsage(AnalysisUsage &AU) const override {
209  }
210 
211  bool runOnModule(Module &M) override;
212 };
213 
214 } // end anonymous namespace
215 
216 /// Check whether \p F is inherently cold.
217 bool HotColdSplitting::isFunctionCold(const Function &F) const {
219  return true;
220 
222  return true;
223 
224  if (PSI->isFunctionEntryCold(&F))
225  return true;
226 
227  return false;
228 }
229 
230 // Returns false if the function should not be considered for hot-cold split
231 // optimization.
232 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
233  if (F.hasFnAttribute(Attribute::AlwaysInline))
234  return false;
235 
236  if (F.hasFnAttribute(Attribute::NoInline))
237  return false;
238 
239  if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
240  F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
241  F.hasFnAttribute(Attribute::SanitizeThread) ||
242  F.hasFnAttribute(Attribute::SanitizeMemory))
243  return false;
244 
245  return true;
246 }
247 
248 /// Get the benefit score of outlining \p Region.
250  TargetTransformInfo &TTI) {
251  // Sum up the code size costs of non-terminator instructions. Tight coupling
252  // with \ref getOutliningPenalty is needed to model the costs of terminators.
253  int Benefit = 0;
254  for (BasicBlock *BB : Region)
255  for (Instruction &I : BB->instructionsWithoutDebug())
256  if (&I != BB->getTerminator())
257  Benefit +=
259 
260  return Benefit;
261 }
262 
263 /// Get the penalty score for outlining \p Region.
265  unsigned NumInputs, unsigned NumOutputs) {
266  int Penalty = SplittingThreshold;
267  LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
268 
269  // If the splitting threshold is set at or below zero, skip the usual
270  // profitability check.
271  if (SplittingThreshold <= 0)
272  return Penalty;
273 
274  // The typical code size cost for materializing an argument for the outlined
275  // call.
276  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
277  const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
278  Penalty += CostForArgMaterialization * NumInputs;
279 
280  // The typical code size cost for an output alloca, its associated store, and
281  // its associated reload.
282  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
283  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
284  Penalty += CostForRegionOutput * NumOutputs;
285 
286  // Find the number of distinct exit blocks for the region. Use a conservative
287  // check to determine whether control returns from the region.
288  bool NoBlocksReturn = true;
289  SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
290  for (BasicBlock *BB : Region) {
291  // If a block has no successors, only assume it does not return if it's
292  // unreachable.
293  if (succ_empty(BB)) {
294  NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
295  continue;
296  }
297 
298  for (BasicBlock *SuccBB : successors(BB)) {
299  if (find(Region, SuccBB) == Region.end()) {
300  NoBlocksReturn = false;
301  SuccsOutsideRegion.insert(SuccBB);
302  }
303  }
304  }
305 
306  // Apply a `noreturn` bonus.
307  if (NoBlocksReturn) {
308  LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
309  << " non-returning terminators\n");
310  Penalty -= Region.size();
311  }
312 
313  // Apply a penalty for having more than one successor outside of the region.
314  // This penalty accounts for the switch needed in the caller.
315  if (!SuccsOutsideRegion.empty()) {
316  LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
317  << " non-region successors\n");
318  Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
319  }
320 
321  return Penalty;
322 }
323 
324 Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
325  DominatorTree &DT,
327  TargetTransformInfo &TTI,
329  AssumptionCache *AC,
330  unsigned Count) {
331  assert(!Region.empty());
332 
333  // TODO: Pass BFI and BPI to update profile information.
334  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
335  /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
336  /* AllowAlloca */ false,
337  /* Suffix */ "cold." + std::to_string(Count));
338 
339  // Perform a simple cost/benefit analysis to decide whether or not to permit
340  // splitting.
341  SetVector<Value *> Inputs, Outputs, Sinks;
342  CE.findInputsOutputs(Inputs, Outputs, Sinks);
343  int OutliningBenefit = getOutliningBenefit(Region, TTI);
344  int OutliningPenalty =
345  getOutliningPenalty(Region, Inputs.size(), Outputs.size());
346  LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
347  << ", penalty = " << OutliningPenalty << "\n");
348  if (OutliningBenefit <= OutliningPenalty)
349  return nullptr;
350 
351  Function *OrigF = Region[0]->getParent();
352  if (Function *OutF = CE.extractCodeRegion()) {
353  User *U = *OutF->user_begin();
354  CallInst *CI = cast<CallInst>(U);
355  CallSite CS(CI);
356  NumColdRegionsOutlined++;
357  if (TTI.useColdCCForColdCall(*OutF)) {
358  OutF->setCallingConv(CallingConv::Cold);
359  CS.setCallingConv(CallingConv::Cold);
360  }
361  CI->setIsNoInline();
362 
363  markFunctionCold(*OutF, BFI != nullptr);
364 
365  LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
366  ORE.emit([&]() {
367  return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
368  &*Region[0]->begin())
369  << ore::NV("Original", OrigF) << " split cold code into "
370  << ore::NV("Split", OutF);
371  });
372  return OutF;
373  }
374 
375  ORE.emit([&]() {
376  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
377  &*Region[0]->begin())
378  << "Failed to extract region at block "
379  << ore::NV("Block", Region.front());
380  });
381  return nullptr;
382 }
383 
384 /// A pair of (basic block, score).
385 using BlockTy = std::pair<BasicBlock *, unsigned>;
386 
387 namespace {
388 /// A maximal outlining region. This contains all blocks post-dominated by a
389 /// sink block, the sink block itself, and all blocks dominated by the sink.
390 /// If sink-predecessors and sink-successors cannot be extracted in one region,
391 /// the static constructor returns a list of suitable extraction regions.
392 class OutliningRegion {
393  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
394  /// viable sub-region entry point. Blocks with higher scores are better entry
395  /// points (i.e. they are more distant ancestors of the sink block).
396  SmallVector<BlockTy, 0> Blocks = {};
397 
398  /// The suggested entry point into the region. If the region has multiple
399  /// entry points, all blocks within the region may not be reachable from this
400  /// entry point.
401  BasicBlock *SuggestedEntryPoint = nullptr;
402 
403  /// Whether the entire function is cold.
404  bool EntireFunctionCold = false;
405 
406  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
407  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
408  return mayExtractBlock(BB) ? Score : 0;
409  }
410 
411  /// These scores should be lower than the score for predecessor blocks,
412  /// because regions starting at predecessor blocks are typically larger.
413  static constexpr unsigned ScoreForSuccBlock = 1;
414  static constexpr unsigned ScoreForSinkBlock = 1;
415 
416  OutliningRegion(const OutliningRegion &) = delete;
417  OutliningRegion &operator=(const OutliningRegion &) = delete;
418 
419 public:
420  OutliningRegion() = default;
421  OutliningRegion(OutliningRegion &&) = default;
422  OutliningRegion &operator=(OutliningRegion &&) = default;
423 
424  static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
425  const DominatorTree &DT,
426  const PostDominatorTree &PDT) {
427  std::vector<OutliningRegion> Regions;
428  SmallPtrSet<BasicBlock *, 4> RegionBlocks;
429 
430  Regions.emplace_back();
431  OutliningRegion *ColdRegion = &Regions.back();
432 
433  auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
434  RegionBlocks.insert(BB);
435  ColdRegion->Blocks.emplace_back(BB, Score);
436  };
437 
438  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
439  unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
440  ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
441  unsigned BestScore = SinkScore;
442 
443  // Visit SinkBB's ancestors using inverse DFS.
444  auto PredIt = ++idf_begin(&SinkBB);
445  auto PredEnd = idf_end(&SinkBB);
446  while (PredIt != PredEnd) {
447  BasicBlock &PredBB = **PredIt;
448  bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
449 
450  // If the predecessor is cold and has no predecessors, the entire
451  // function must be cold.
452  if (SinkPostDom && pred_empty(&PredBB)) {
453  ColdRegion->EntireFunctionCold = true;
454  return Regions;
455  }
456 
457  // If SinkBB does not post-dominate a predecessor, do not mark the
458  // predecessor (or any of its predecessors) cold.
459  if (!SinkPostDom || !mayExtractBlock(PredBB)) {
460  PredIt.skipChildren();
461  continue;
462  }
463 
464  // Keep track of the post-dominated ancestor farthest away from the sink.
465  // The path length is always >= 2, ensuring that predecessor blocks are
466  // considered as entry points before the sink block.
467  unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
468  if (PredScore > BestScore) {
469  ColdRegion->SuggestedEntryPoint = &PredBB;
470  BestScore = PredScore;
471  }
472 
473  addBlockToRegion(&PredBB, PredScore);
474  ++PredIt;
475  }
476 
477  // If the sink can be added to the cold region, do so. It's considered as
478  // an entry point before any sink-successor blocks.
479  //
480  // Otherwise, split cold sink-successor blocks using a separate region.
481  // This satisfies the requirement that all extraction blocks other than the
482  // first have predecessors within the extraction region.
483  if (mayExtractBlock(SinkBB)) {
484  addBlockToRegion(&SinkBB, SinkScore);
485  } else {
486  Regions.emplace_back();
487  ColdRegion = &Regions.back();
488  BestScore = 0;
489  }
490 
491  // Find all successors of SinkBB dominated by SinkBB using DFS.
492  auto SuccIt = ++df_begin(&SinkBB);
493  auto SuccEnd = df_end(&SinkBB);
494  while (SuccIt != SuccEnd) {
495  BasicBlock &SuccBB = **SuccIt;
496  bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
497 
498  // Don't allow the backwards & forwards DFSes to mark the same block.
499  bool DuplicateBlock = RegionBlocks.count(&SuccBB);
500 
501  // If SinkBB does not dominate a successor, do not mark the successor (or
502  // any of its successors) cold.
503  if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
504  SuccIt.skipChildren();
505  continue;
506  }
507 
508  unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
509  if (SuccScore > BestScore) {
510  ColdRegion->SuggestedEntryPoint = &SuccBB;
511  BestScore = SuccScore;
512  }
513 
514  addBlockToRegion(&SuccBB, SuccScore);
515  ++SuccIt;
516  }
517 
518  return Regions;
519  }
520 
521  /// Whether this region has nothing to extract.
522  bool empty() const { return !SuggestedEntryPoint; }
523 
524  /// The blocks in this region.
525  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
526 
527  /// Whether the entire function containing this region is cold.
528  bool isEntireFunctionCold() const { return EntireFunctionCold; }
529 
530  /// Remove a sub-region from this region and return it as a block sequence.
531  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
532  assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
533 
534  // Remove blocks dominated by the suggested entry point from this region.
535  // During the removal, identify the next best entry point into the region.
536  // Ensure that the first extracted block is the suggested entry point.
537  BlockSequence SubRegion = {SuggestedEntryPoint};
538  BasicBlock *NextEntryPoint = nullptr;
539  unsigned NextScore = 0;
540  auto RegionEndIt = Blocks.end();
541  auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
542  BasicBlock *BB = Block.first;
543  unsigned Score = Block.second;
544  bool InSubRegion =
545  BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
546  if (!InSubRegion && Score > NextScore) {
547  NextEntryPoint = BB;
548  NextScore = Score;
549  }
550  if (InSubRegion && BB != SuggestedEntryPoint)
551  SubRegion.push_back(BB);
552  return InSubRegion;
553  });
554  Blocks.erase(RegionStartIt, RegionEndIt);
555 
556  // Update the suggested entry point.
557  SuggestedEntryPoint = NextEntryPoint;
558 
559  return SubRegion;
560  }
561 };
562 } // namespace
563 
564 bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
565  bool Changed = false;
566 
567  // The set of cold blocks.
568  SmallPtrSet<BasicBlock *, 4> ColdBlocks;
569 
570  // The worklist of non-intersecting regions left to outline.
571  SmallVector<OutliningRegion, 2> OutliningWorklist;
572 
573  // Set up an RPO traversal. Experimentally, this performs better (outlines
574  // more) than a PO traversal, because we prevent region overlap by keeping
575  // the first region to contain a block.
577 
578  // Calculate domtrees lazily. This reduces compile-time significantly.
579  std::unique_ptr<DominatorTree> DT;
580  std::unique_ptr<PostDominatorTree> PDT;
581 
582  // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
583  // reduces compile-time significantly. TODO: When we *do* use BFI, we should
584  // be able to salvage its domtrees instead of recomputing them.
585  BlockFrequencyInfo *BFI = nullptr;
586  if (HasProfileSummary)
587  BFI = GetBFI(F);
588 
589  TargetTransformInfo &TTI = GetTTI(F);
590  OptimizationRemarkEmitter &ORE = (*GetORE)(F);
591  AssumptionCache *AC = LookupAC(F);
592 
593  // Find all cold regions.
594  for (BasicBlock *BB : RPOT) {
595  // This block is already part of some outlining region.
596  if (ColdBlocks.count(BB))
597  continue;
598 
599  bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
600  (EnableStaticAnalyis && unlikelyExecuted(*BB));
601  if (!Cold)
602  continue;
603 
604  LLVM_DEBUG({
605  dbgs() << "Found a cold block:\n";
606  BB->dump();
607  });
608 
609  if (!DT)
610  DT = std::make_unique<DominatorTree>(F);
611  if (!PDT)
612  PDT = std::make_unique<PostDominatorTree>(F);
613 
614  auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
615  for (OutliningRegion &Region : Regions) {
616  if (Region.empty())
617  continue;
618 
619  if (Region.isEntireFunctionCold()) {
620  LLVM_DEBUG(dbgs() << "Entire function is cold\n");
621  return markFunctionCold(F);
622  }
623 
624  // If this outlining region intersects with another, drop the new region.
625  //
626  // TODO: It's theoretically possible to outline more by only keeping the
627  // largest region which contains a block, but the extra bookkeeping to do
628  // this is tricky/expensive.
629  bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
630  return !ColdBlocks.insert(Block.first).second;
631  });
632  if (RegionsOverlap)
633  continue;
634 
635  OutliningWorklist.emplace_back(std::move(Region));
636  ++NumColdRegionsFound;
637  }
638  }
639 
640  // Outline single-entry cold regions, splitting up larger regions as needed.
641  unsigned OutlinedFunctionID = 1;
642  while (!OutliningWorklist.empty()) {
643  OutliningRegion Region = OutliningWorklist.pop_back_val();
644  assert(!Region.empty() && "Empty outlining region in worklist");
645  do {
646  BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
647  LLVM_DEBUG({
648  dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
649  for (BasicBlock *BB : SubRegion)
650  BB->dump();
651  });
652 
653  Function *Outlined = extractColdRegion(SubRegion, *DT, BFI, TTI, ORE, AC,
654  OutlinedFunctionID);
655  if (Outlined) {
656  ++OutlinedFunctionID;
657  Changed = true;
658  }
659  } while (!Region.empty());
660  }
661 
662  return Changed;
663 }
664 
665 bool HotColdSplitting::run(Module &M) {
666  bool Changed = false;
667  bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
668  for (auto It = M.begin(), End = M.end(); It != End; ++It) {
669  Function &F = *It;
670 
671  // Do not touch declarations.
672  if (F.isDeclaration())
673  continue;
674 
675  // Do not modify `optnone` functions.
676  if (F.hasOptNone())
677  continue;
678 
679  // Detect inherently cold functions and mark them as such.
680  if (isFunctionCold(F)) {
681  Changed |= markFunctionCold(F);
682  continue;
683  }
684 
685  if (!shouldOutlineFrom(F)) {
686  LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
687  continue;
688  }
689 
690  LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
691  Changed |= outlineColdRegions(F, HasProfileSummary);
692  }
693  return Changed;
694 }
695 
696 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
697  if (skipModule(M))
698  return false;
699  ProfileSummaryInfo *PSI =
700  &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
701  auto GTTI = [this](Function &F) -> TargetTransformInfo & {
702  return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
703  };
704  auto GBFI = [this](Function &F) {
705  return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
706  };
707  std::unique_ptr<OptimizationRemarkEmitter> ORE;
708  std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
709  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
710  ORE.reset(new OptimizationRemarkEmitter(&F));
711  return *ORE.get();
712  };
713  auto LookupAC = [this](Function &F) -> AssumptionCache * {
714  if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
715  return ACT->lookupAssumptionCache(F);
716  return nullptr;
717  };
718 
719  return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
720 }
721 
724  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
725 
726  auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
727  return FAM.getCachedResult<AssumptionAnalysis>(F);
728  };
729 
730  auto GBFI = [&FAM](Function &F) {
731  return &FAM.getResult<BlockFrequencyAnalysis>(F);
732  };
733 
734  std::function<TargetTransformInfo &(Function &)> GTTI =
735  [&FAM](Function &F) -> TargetTransformInfo & {
736  return FAM.getResult<TargetIRAnalysis>(F);
737  };
738 
739  std::unique_ptr<OptimizationRemarkEmitter> ORE;
740  std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
741  [&ORE](Function &F) -> OptimizationRemarkEmitter & {
742  ORE.reset(new OptimizationRemarkEmitter(&F));
743  return *ORE.get();
744  };
745 
747 
748  if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
749  return PreservedAnalyses::none();
750  return PreservedAnalyses::all();
751 }
752 
754 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
755  "Hot Cold Splitting", false, false)
758 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
759  "Hot Cold Splitting", false, false)
760 
762  return new HotColdSplittingLegacyPass();
763 }
INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", "Hot Cold Splitting", false, false) INITIALIZE_PASS_END(HotColdSplittingLegacyPass
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:52
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
Diagnostic information for missed-optimization remarks.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
DiagnosticInfoOptimizationBase::Argument NV
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:776
This class represents lattice values for constants.
Definition: AllocatorList.h:23
bool hasOptNone() const
Do not optimize this function (-O0).
Definition: Function.h:616
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:65
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Hot Cold Splitting
bool useColdCCForColdCall(Function &F) const
Return true if the input function which is cold at all call sites, should use coldcc calling conventi...
Analysis providing profile information.
This class represents a function call, abstracting a target machine&#39;s calling convention.
static cl::opt< bool > EnableStaticAnalyis("hot-cold-static-analysis", cl::init(true), cl::Hidden)
This file contains the declarations for metadata subclasses.
An immutable pass that tracks lazily created AssumptionCache objects.
Metadata * getProfileSummary(bool IsCS)
Returns profile summary metadata.
Definition: Module.cpp:541
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:104
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:323
F(f)
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:144
This defines the Use class.
int getInstructionCost(const Instruction *I, enum TargetCostKind kind) const
Query the cost of a specified instruction.
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4428
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void setEntryCount(ProfileCount Count, const DenseSet< GlobalValue::GUID > *Imports=nullptr)
Set the entry count for this function.
Definition: Function.cpp:1505
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Legacy analysis pass which computes BlockFrequencyInfo.
bool empty() const
Definition: BasicBlock.h:284
STATISTIC(NumColdRegionsFound, "Number of cold regions found.")
static bool blockEndsInUnreachable(const MachineBasicBlock *MBB)
A no successor, non-return block probably ends in unreachable and is cold.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:32
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:156
idf_iterator< T > idf_begin(const T &G)
idf_iterator< T > idf_end(const T &G)
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:153
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
df_iterator< T > df_end(const T &G)
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug() const
Return a const iterator range over the instructions in the block, skipping any debug instructions...
Definition: BasicBlock.cpp:94
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
void setIsNoInline()
Definition: InstrTypes.h:1621
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
Diagnostic information for applied optimization remarks.
Represent the analysis usage information of a pass.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1172
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
bool pred_empty(const BasicBlock *BB)
Definition: CFG.h:116
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1205
bool succ_empty(const Instruction *I)
Definition: CFG.h:253
iterator erase(const_iterator CI)
Definition: SmallVector.h:434
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:159
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:197
size_type size() const
Definition: SmallPtrSet.h:92
A function analysis which provides an AssumptionCache.
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches, switches, etc.
Definition: BasicBlock.h:396
Analysis pass which computes BlockFrequencyInfo.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
std::pair< BasicBlock *, unsigned > BlockTy
A pair of (basic block, score).
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
void initializeHotColdSplittingLegacyPassPass(PassRegistry &)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static cl::opt< int > SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden, cl::desc("Base penalty for splitting cold code (as a " "multiple of TCC_Basic)"))
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
Module.h This file contains the declarations for the Module class.
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:374
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static int getOutliningBenefit(ArrayRef< BasicBlock *> Region, TargetTransformInfo &TTI)
Get the benefit score of outlining Region.
df_iterator< T > df_begin(const T &G)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iterator end()
Definition: Module.h:600
ModulePass * createHotColdSplittingPass()
createHotColdSplittingPass - This pass outlines cold blocks into a separate function(s).
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
static int getOutliningPenalty(ArrayRef< BasicBlock *> Region, unsigned NumInputs, unsigned NumOutputs)
Get the penalty score for outlining Region.
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
#define I(x, y, z)
Definition: MD5.cpp:58
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:224
iterator begin()
Definition: Module.h:598
const std::string to_string(const T &Value)
Definition: ScopedPrinter.h:61
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:231
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
user_iterator user_begin()
Definition: Value.h:395
The cost of a typical &#39;add&#39; instruction.
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:411
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
#define DEBUG_TYPE
succ_range successors(Instruction *I)
Definition: CFG.h:259
hotcoldsplit
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.h:229
print Print MemDeps of function
A container for analyses that lazily runs them and caches their results.
This pass exposes codegen information to IR-level passes.
This header defines various interfaces for pass management in LLVM.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
The optimization diagnostic interface.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:1044