LLVM  8.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements basic block placement transformations using the CFG
11 // structure and branch probability estimates.
12 //
13 // The pass strives to preserve the structure of the CFG (that is, retain
14 // a topological ordering of basic blocks) in the absence of a *strong* signal
15 // to the contrary from probabilities. However, within the CFG structure, it
16 // attempts to choose an ordering which favors placing more likely sequences of
17 // blocks adjacent to each other.
18 //
19 // The algorithm works from the inner-most loop within a function outward, and
20 // at each stage walks through the basic blocks, trying to coalesce them into
21 // sequential chains where allowed by the CFG (or demanded by heavy
22 // probabilities). Finally, it walks the blocks in topological order, and the
23 // first time it reaches a chain of basic blocks, it schedules them in the
24 // function in-order.
25 //
26 //===----------------------------------------------------------------------===//
27 
28 #include "BranchFolding.h"
29 #include "llvm/ADT/ArrayRef.h"
30 #include "llvm/ADT/DenseMap.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/Pass.h"
53 #include "llvm/Support/Allocator.h"
56 #include "llvm/Support/CodeGen.h"
58 #include "llvm/Support/Compiler.h"
59 #include "llvm/Support/Debug.h"
62 #include <algorithm>
63 #include <cassert>
64 #include <cstdint>
65 #include <iterator>
66 #include <memory>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 #include <vector>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "block-placement"
75 
76 STATISTIC(NumCondBranches, "Number of conditional branches");
77 STATISTIC(NumUncondBranches, "Number of unconditional branches");
78 STATISTIC(CondBranchTakenFreq,
79  "Potential frequency of taking conditional branches");
80 STATISTIC(UncondBranchTakenFreq,
81  "Potential frequency of taking unconditional branches");
82 
83 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
84  cl::desc("Force the alignment of all "
85  "blocks in the function."),
86  cl::init(0), cl::Hidden);
87 
89  "align-all-nofallthru-blocks",
90  cl::desc("Force the alignment of all "
91  "blocks that have no fall-through predecessors (i.e. don't add "
92  "nops that are executed)."),
93  cl::init(0), cl::Hidden);
94 
95 // FIXME: Find a good default for this flag and remove the flag.
97  "block-placement-exit-block-bias",
98  cl::desc("Block frequency percentage a loop exit block needs "
99  "over the original exit to be considered the new exit."),
100  cl::init(0), cl::Hidden);
101 
102 // Definition:
103 // - Outlining: placement of a basic block outside the chain or hot path.
104 
106  "loop-to-cold-block-ratio",
107  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
108  "(frequency of block) is greater than this ratio"),
109  cl::init(5), cl::Hidden);
110 
112  "force-loop-cold-block",
113  cl::desc("Force outlining cold blocks from loops."),
114  cl::init(false), cl::Hidden);
115 
116 static cl::opt<bool>
117  PreciseRotationCost("precise-rotation-cost",
118  cl::desc("Model the cost of loop rotation more "
119  "precisely by using profile data."),
120  cl::init(false), cl::Hidden);
121 
122 static cl::opt<bool>
123  ForcePreciseRotationCost("force-precise-rotation-cost",
124  cl::desc("Force the use of precise cost "
125  "loop rotation strategy."),
126  cl::init(false), cl::Hidden);
127 
129  "misfetch-cost",
130  cl::desc("Cost that models the probabilistic risk of an instruction "
131  "misfetch due to a jump comparing to falling through, whose cost "
132  "is zero."),
133  cl::init(1), cl::Hidden);
134 
135 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
136  cl::desc("Cost of jump instructions."),
137  cl::init(1), cl::Hidden);
138 static cl::opt<bool>
139 TailDupPlacement("tail-dup-placement",
140  cl::desc("Perform tail duplication during placement. "
141  "Creates more fallthrough opportunites in "
142  "outline branches."),
143  cl::init(true), cl::Hidden);
144 
145 static cl::opt<bool>
146 BranchFoldPlacement("branch-fold-placement",
147  cl::desc("Perform branch folding during placement. "
148  "Reduces code size."),
149  cl::init(true), cl::Hidden);
150 
151 // Heuristic for tail duplication.
153  "tail-dup-placement-threshold",
154  cl::desc("Instruction cutoff for tail duplication during layout. "
155  "Tail merging during layout is forced to have a threshold "
156  "that won't conflict."), cl::init(2),
157  cl::Hidden);
158 
159 // Heuristic for aggressive tail duplication.
161  "tail-dup-placement-aggressive-threshold",
162  cl::desc("Instruction cutoff for aggressive tail duplication during "
163  "layout. Used at -O3. Tail merging during layout is forced to "
164  "have a threshold that won't conflict."), cl::init(4),
165  cl::Hidden);
166 
167 // Heuristic for tail duplication.
169  "tail-dup-placement-penalty",
170  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
171  "Copying can increase fallthrough, but it also increases icache "
172  "pressure. This parameter controls the penalty to account for that. "
173  "Percent as integer."),
174  cl::init(2),
175  cl::Hidden);
176 
177 // Heuristic for triangle chains.
179  "triangle-chain-count",
180  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
181  "triangle tail duplication heuristic to kick in. 0 to disable."),
182  cl::init(2),
183  cl::Hidden);
184 
187 
188 // Internal option used to control BFI display only after MBP pass.
189 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
190 // -view-block-layout-with-bfi=
192 
193 // Command line option to specify the name of the function for CFG dump
194 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
196 
197 namespace {
198 
199 class BlockChain;
200 
201 /// Type for our function-wide basic block -> block chain mapping.
202 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
203 
204 /// A chain of blocks which will be laid out contiguously.
205 ///
206 /// This is the datastructure representing a chain of consecutive blocks that
207 /// are profitable to layout together in order to maximize fallthrough
208 /// probabilities and code locality. We also can use a block chain to represent
209 /// a sequence of basic blocks which have some external (correctness)
210 /// requirement for sequential layout.
211 ///
212 /// Chains can be built around a single basic block and can be merged to grow
213 /// them. They participate in a block-to-chain mapping, which is updated
214 /// automatically as chains are merged together.
215 class BlockChain {
216  /// The sequence of blocks belonging to this chain.
217  ///
218  /// This is the sequence of blocks for a particular chain. These will be laid
219  /// out in-order within the function.
221 
222  /// A handle to the function-wide basic block to block chain mapping.
223  ///
224  /// This is retained in each block chain to simplify the computation of child
225  /// block chains for SCC-formation and iteration. We store the edges to child
226  /// basic blocks, and map them back to their associated chains using this
227  /// structure.
228  BlockToChainMapType &BlockToChain;
229 
230 public:
231  /// Construct a new BlockChain.
232  ///
233  /// This builds a new block chain representing a single basic block in the
234  /// function. It also registers itself as the chain that block participates
235  /// in with the BlockToChain mapping.
236  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
237  : Blocks(1, BB), BlockToChain(BlockToChain) {
238  assert(BB && "Cannot create a chain with a null basic block");
239  BlockToChain[BB] = this;
240  }
241 
242  /// Iterator over blocks within the chain.
245 
246  /// Beginning of blocks within the chain.
247  iterator begin() { return Blocks.begin(); }
248  const_iterator begin() const { return Blocks.begin(); }
249 
250  /// End of blocks within the chain.
251  iterator end() { return Blocks.end(); }
252  const_iterator end() const { return Blocks.end(); }
253 
254  bool remove(MachineBasicBlock* BB) {
255  for(iterator i = begin(); i != end(); ++i) {
256  if (*i == BB) {
257  Blocks.erase(i);
258  return true;
259  }
260  }
261  return false;
262  }
263 
264  /// Merge a block chain into this one.
265  ///
266  /// This routine merges a block chain into this one. It takes care of forming
267  /// a contiguous sequence of basic blocks, updating the edge list, and
268  /// updating the block -> chain mapping. It does not free or tear down the
269  /// old chain, but the old chain's block list is no longer valid.
270  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
271  assert(BB && "Can't merge a null block.");
272  assert(!Blocks.empty() && "Can't merge into an empty chain.");
273 
274  // Fast path in case we don't have a chain already.
275  if (!Chain) {
276  assert(!BlockToChain[BB] &&
277  "Passed chain is null, but BB has entry in BlockToChain.");
278  Blocks.push_back(BB);
279  BlockToChain[BB] = this;
280  return;
281  }
282 
283  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
284  assert(Chain->begin() != Chain->end());
285 
286  // Update the incoming blocks to point to this chain, and add them to the
287  // chain structure.
288  for (MachineBasicBlock *ChainBB : *Chain) {
289  Blocks.push_back(ChainBB);
290  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
291  BlockToChain[ChainBB] = this;
292  }
293  }
294 
295 #ifndef NDEBUG
296  /// Dump the blocks in this chain.
297  LLVM_DUMP_METHOD void dump() {
298  for (MachineBasicBlock *MBB : *this)
299  MBB->dump();
300  }
301 #endif // NDEBUG
302 
303  /// Count of predecessors of any block within the chain which have not
304  /// yet been scheduled. In general, we will delay scheduling this chain
305  /// until those predecessors are scheduled (or we find a sufficiently good
306  /// reason to override this heuristic.) Note that when forming loop chains,
307  /// blocks outside the loop are ignored and treated as if they were already
308  /// scheduled.
309  ///
310  /// Note: This field is reinitialized multiple times - once for each loop,
311  /// and then once for the function as a whole.
312  unsigned UnscheduledPredecessors = 0;
313 };
314 
315 class MachineBlockPlacement : public MachineFunctionPass {
316  /// A type for a block filter set.
317  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
318 
319  /// Pair struct containing basic block and taildup profitiability
320  struct BlockAndTailDupResult {
321  MachineBasicBlock *BB;
322  bool ShouldTailDup;
323  };
324 
325  /// Triple struct containing edge weight and the edge.
326  struct WeightedEdge {
327  BlockFrequency Weight;
328  MachineBasicBlock *Src;
329  MachineBasicBlock *Dest;
330  };
331 
332  /// work lists of blocks that are ready to be laid out
335 
336  /// Edges that have already been computed as optimal.
338 
339  /// Machine Function
341 
342  /// A handle to the branch probability pass.
343  const MachineBranchProbabilityInfo *MBPI;
344 
345  /// A handle to the function-wide block frequency pass.
346  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
347 
348  /// A handle to the loop info.
349  MachineLoopInfo *MLI;
350 
351  /// Preferred loop exit.
352  /// Member variable for convenience. It may be removed by duplication deep
353  /// in the call stack.
354  MachineBasicBlock *PreferredLoopExit;
355 
356  /// A handle to the target's instruction info.
357  const TargetInstrInfo *TII;
358 
359  /// A handle to the target's lowering info.
360  const TargetLoweringBase *TLI;
361 
362  /// A handle to the post dominator tree.
364 
365  /// Duplicator used to duplicate tails during placement.
366  ///
367  /// Placement decisions can open up new tail duplication opportunities, but
368  /// since tail duplication affects placement decisions of later blocks, it
369  /// must be done inline.
370  TailDuplicator TailDup;
371 
372  /// Allocator and owner of BlockChain structures.
373  ///
374  /// We build BlockChains lazily while processing the loop structure of
375  /// a function. To reduce malloc traffic, we allocate them using this
376  /// slab-like allocator, and destroy them after the pass completes. An
377  /// important guarantee is that this allocator produces stable pointers to
378  /// the chains.
380 
381  /// Function wide BasicBlock to BlockChain mapping.
382  ///
383  /// This mapping allows efficiently moving from any given basic block to the
384  /// BlockChain it participates in, if any. We use it to, among other things,
385  /// allow implicitly defining edges between chains as the existing edges
386  /// between basic blocks.
388 
389 #ifndef NDEBUG
390  /// The set of basic blocks that have terminators that cannot be fully
391  /// analyzed. These basic blocks cannot be re-ordered safely by
392  /// MachineBlockPlacement, and we must preserve physical layout of these
393  /// blocks and their successors through the pass.
394  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
395 #endif
396 
397  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
398  /// if the count goes to 0, add them to the appropriate work list.
399  void markChainSuccessors(
400  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
401  const BlockFilterSet *BlockFilter = nullptr);
402 
403  /// Decrease the UnscheduledPredecessors count for a single block, and
404  /// if the count goes to 0, add them to the appropriate work list.
405  void markBlockSuccessors(
406  const BlockChain &Chain, const MachineBasicBlock *BB,
407  const MachineBasicBlock *LoopHeaderBB,
408  const BlockFilterSet *BlockFilter = nullptr);
409 
411  collectViableSuccessors(
412  const MachineBasicBlock *BB, const BlockChain &Chain,
413  const BlockFilterSet *BlockFilter,
415  bool shouldPredBlockBeOutlined(
416  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
417  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
418  BranchProbability SuccProb, BranchProbability HotProb);
419  bool repeatedlyTailDuplicateBlock(
421  const MachineBasicBlock *LoopHeaderBB,
422  BlockChain &Chain, BlockFilterSet *BlockFilter,
423  MachineFunction::iterator &PrevUnplacedBlockIt);
424  bool maybeTailDuplicateBlock(
426  BlockChain &Chain, BlockFilterSet *BlockFilter,
427  MachineFunction::iterator &PrevUnplacedBlockIt,
428  bool &DuplicatedToLPred);
429  bool hasBetterLayoutPredecessor(
430  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
431  const BlockChain &SuccChain, BranchProbability SuccProb,
432  BranchProbability RealSuccProb, const BlockChain &Chain,
433  const BlockFilterSet *BlockFilter);
434  BlockAndTailDupResult selectBestSuccessor(
435  const MachineBasicBlock *BB, const BlockChain &Chain,
436  const BlockFilterSet *BlockFilter);
437  MachineBasicBlock *selectBestCandidateBlock(
438  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
439  MachineBasicBlock *getFirstUnplacedBlock(
440  const BlockChain &PlacedChain,
441  MachineFunction::iterator &PrevUnplacedBlockIt,
442  const BlockFilterSet *BlockFilter);
443 
444  /// Add a basic block to the work list if it is appropriate.
445  ///
446  /// If the optional parameter BlockFilter is provided, only MBB
447  /// present in the set will be added to the worklist. If nullptr
448  /// is provided, no filtering occurs.
449  void fillWorkLists(const MachineBasicBlock *MBB,
450  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
451  const BlockFilterSet *BlockFilter);
452 
453  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
454  BlockFilterSet *BlockFilter = nullptr);
455  MachineBasicBlock *findBestLoopTop(
456  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
457  MachineBasicBlock *findBestLoopExit(
458  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
459  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
460  void buildLoopChains(const MachineLoop &L);
461  void rotateLoop(
462  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
463  const BlockFilterSet &LoopBlockSet);
464  void rotateLoopWithProfile(
465  BlockChain &LoopChain, const MachineLoop &L,
466  const BlockFilterSet &LoopBlockSet);
467  void buildCFGChains();
468  void optimizeBranches();
469  void alignBlocks();
470  /// Returns true if a block should be tail-duplicated to increase fallthrough
471  /// opportunities.
472  bool shouldTailDuplicate(MachineBasicBlock *BB);
473  /// Check the edge frequencies to see if tail duplication will increase
474  /// fallthroughs.
475  bool isProfitableToTailDup(
476  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
477  BranchProbability QProb,
478  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
479 
480  /// Check for a trellis layout.
481  bool isTrellis(const MachineBasicBlock *BB,
482  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
483  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
484 
485  /// Get the best successor given a trellis layout.
486  BlockAndTailDupResult getBestTrellisSuccessor(
487  const MachineBasicBlock *BB,
488  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
489  BranchProbability AdjustedSumProb, const BlockChain &Chain,
490  const BlockFilterSet *BlockFilter);
491 
492  /// Get the best pair of non-conflicting edges.
493  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
494  const MachineBasicBlock *BB,
496 
497  /// Returns true if a block can tail duplicate into all unplaced
498  /// predecessors. Filters based on loop.
499  bool canTailDuplicateUnplacedPreds(
500  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
501  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
502 
503  /// Find chains of triangles to tail-duplicate where a global analysis works,
504  /// but a local analysis would not find them.
505  void precomputeTriangleChains();
506 
507 public:
508  static char ID; // Pass identification, replacement for typeid
509 
510  MachineBlockPlacement() : MachineFunctionPass(ID) {
512  }
513 
514  bool runOnMachineFunction(MachineFunction &F) override;
515 
516  bool allowTailDupPlacement() const {
517  assert(F);
519  }
520 
521  void getAnalysisUsage(AnalysisUsage &AU) const override {
524  if (TailDupPlacement)
529  }
530 };
531 
532 } // end anonymous namespace
533 
535 
537 
538 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
539  "Branch Probability Basic Block Placement", false, false)
544 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
545  "Branch Probability Basic Block Placement", false, false)
546 
547 #ifndef NDEBUG
548 /// Helper to print the name of a MBB.
549 ///
550 /// Only used by debug logging.
551 static std::string getBlockName(const MachineBasicBlock *BB) {
552  std::string Result;
553  raw_string_ostream OS(Result);
554  OS << printMBBReference(*BB);
555  OS << " ('" << BB->getName() << "')";
556  OS.flush();
557  return Result;
558 }
559 #endif
560 
561 /// Mark a chain's successors as having one fewer preds.
562 ///
563 /// When a chain is being merged into the "placed" chain, this routine will
564 /// quickly walk the successors of each block in the chain and mark them as
565 /// having one fewer active predecessor. It also adds any successors of this
566 /// chain which reach the zero-predecessor state to the appropriate worklist.
567 void MachineBlockPlacement::markChainSuccessors(
568  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
569  const BlockFilterSet *BlockFilter) {
570  // Walk all the blocks in this chain, marking their successors as having
571  // a predecessor placed.
572  for (MachineBasicBlock *MBB : Chain) {
573  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
574  }
575 }
576 
577 /// Mark a single block's successors as having one fewer preds.
578 ///
579 /// Under normal circumstances, this is only called by markChainSuccessors,
580 /// but if a block that was to be placed is completely tail-duplicated away,
581 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
582 /// for just that block.
583 void MachineBlockPlacement::markBlockSuccessors(
584  const BlockChain &Chain, const MachineBasicBlock *MBB,
585  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
586  // Add any successors for which this is the only un-placed in-loop
587  // predecessor to the worklist as a viable candidate for CFG-neutral
588  // placement. No subsequent placement of this block will violate the CFG
589  // shape, so we get to use heuristics to choose a favorable placement.
590  for (MachineBasicBlock *Succ : MBB->successors()) {
591  if (BlockFilter && !BlockFilter->count(Succ))
592  continue;
593  BlockChain &SuccChain = *BlockToChain[Succ];
594  // Disregard edges within a fixed chain, or edges to the loop header.
595  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
596  continue;
597 
598  // This is a cross-chain edge that is within the loop, so decrement the
599  // loop predecessor count of the destination chain.
600  if (SuccChain.UnscheduledPredecessors == 0 ||
601  --SuccChain.UnscheduledPredecessors > 0)
602  continue;
603 
604  auto *NewBB = *SuccChain.begin();
605  if (NewBB->isEHPad())
606  EHPadWorkList.push_back(NewBB);
607  else
608  BlockWorkList.push_back(NewBB);
609  }
610 }
611 
612 /// This helper function collects the set of successors of block
613 /// \p BB that are allowed to be its layout successors, and return
614 /// the total branch probability of edges from \p BB to those
615 /// blocks.
616 BranchProbability MachineBlockPlacement::collectViableSuccessors(
617  const MachineBasicBlock *BB, const BlockChain &Chain,
618  const BlockFilterSet *BlockFilter,
620  // Adjust edge probabilities by excluding edges pointing to blocks that is
621  // either not in BlockFilter or is already in the current chain. Consider the
622  // following CFG:
623  //
624  // --->A
625  // | / \
626  // | B C
627  // | \ / \
628  // ----D E
629  //
630  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
631  // A->C is chosen as a fall-through, D won't be selected as a successor of C
632  // due to CFG constraint (the probability of C->D is not greater than
633  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
634  // when calculating the probability of C->D, D will be selected and we
635  // will get A C D B as the layout of this loop.
636  auto AdjustedSumProb = BranchProbability::getOne();
637  for (MachineBasicBlock *Succ : BB->successors()) {
638  bool SkipSucc = false;
639  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
640  SkipSucc = true;
641  } else {
642  BlockChain *SuccChain = BlockToChain[Succ];
643  if (SuccChain == &Chain) {
644  SkipSucc = true;
645  } else if (Succ != *SuccChain->begin()) {
646  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
647  << " -> Mid chain!\n");
648  continue;
649  }
650  }
651  if (SkipSucc)
652  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
653  else
654  Successors.push_back(Succ);
655  }
656 
657  return AdjustedSumProb;
658 }
659 
660 /// The helper function returns the branch probability that is adjusted
661 /// or normalized over the new total \p AdjustedSumProb.
662 static BranchProbability
664  BranchProbability AdjustedSumProb) {
665  BranchProbability SuccProb;
666  uint32_t SuccProbN = OrigProb.getNumerator();
667  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
668  if (SuccProbN >= SuccProbD)
669  SuccProb = BranchProbability::getOne();
670  else
671  SuccProb = BranchProbability(SuccProbN, SuccProbD);
672 
673  return SuccProb;
674 }
675 
676 /// Check if \p BB has exactly the successors in \p Successors.
677 static bool
680  if (BB.succ_size() != Successors.size())
681  return false;
682  // We don't want to count self-loops
683  if (Successors.count(&BB))
684  return false;
685  for (MachineBasicBlock *Succ : BB.successors())
686  if (!Successors.count(Succ))
687  return false;
688  return true;
689 }
690 
691 /// Check if a block should be tail duplicated to increase fallthrough
692 /// opportunities.
693 /// \p BB Block to check.
694 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
695  // Blocks with single successors don't create additional fallthrough
696  // opportunities. Don't duplicate them. TODO: When conditional exits are
697  // analyzable, allow them to be duplicated.
698  bool IsSimple = TailDup.isSimpleBB(BB);
699 
700  if (BB->succ_size() == 1)
701  return false;
702  return TailDup.shouldTailDuplicate(IsSimple, *BB);
703 }
704 
705 /// Compare 2 BlockFrequency's with a small penalty for \p A.
706 /// In order to be conservative, we apply a X% penalty to account for
707 /// increased icache pressure and static heuristics. For small frequencies
708 /// we use only the numerators to improve accuracy. For simplicity, we assume the
709 /// penalty is less than 100%
710 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
712  uint64_t EntryFreq) {
713  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
714  BlockFrequency Gain = A - B;
715  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
716 }
717 
718 /// Check the edge frequencies to see if tail duplication will increase
719 /// fallthroughs. It only makes sense to call this function when
720 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
721 /// always locally profitable if we would have picked \p Succ without
722 /// considering duplication.
723 bool MachineBlockPlacement::isProfitableToTailDup(
724  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
725  BranchProbability QProb,
726  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
727  // We need to do a probability calculation to make sure this is profitable.
728  // First: does succ have a successor that post-dominates? This affects the
729  // calculation. The 2 relevant cases are:
730  // BB BB
731  // | \Qout | \Qout
732  // P| C |P C
733  // = C' = C'
734  // | /Qin | /Qin
735  // | / | /
736  // Succ Succ
737  // / \ | \ V
738  // U/ =V |U \
739  // / \ = D
740  // D E | /
741  // | /
742  // |/
743  // PDom
744  // '=' : Branch taken for that CFG edge
745  // In the second case, Placing Succ while duplicating it into C prevents the
746  // fallthrough of Succ into either D or PDom, because they now have C as an
747  // unplaced predecessor
748 
749  // Start by figuring out which case we fall into
750  MachineBasicBlock *PDom = nullptr;
752  // Only scan the relevant successors
753  auto AdjustedSuccSumProb =
754  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
755  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
756  auto BBFreq = MBFI->getBlockFreq(BB);
757  auto SuccFreq = MBFI->getBlockFreq(Succ);
758  BlockFrequency P = BBFreq * PProb;
759  BlockFrequency Qout = BBFreq * QProb;
760  uint64_t EntryFreq = MBFI->getEntryFreq();
761  // If there are no more successors, it is profitable to copy, as it strictly
762  // increases fallthrough.
763  if (SuccSuccs.size() == 0)
764  return greaterWithBias(P, Qout, EntryFreq);
765 
766  auto BestSuccSucc = BranchProbability::getZero();
767  // Find the PDom or the best Succ if no PDom exists.
768  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
769  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
770  if (Prob > BestSuccSucc)
771  BestSuccSucc = Prob;
772  if (PDom == nullptr)
773  if (MPDT->dominates(SuccSucc, Succ)) {
774  PDom = SuccSucc;
775  break;
776  }
777  }
778  // For the comparisons, we need to know Succ's best incoming edge that isn't
779  // from BB.
780  auto SuccBestPred = BlockFrequency(0);
781  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
782  if (SuccPred == Succ || SuccPred == BB
783  || BlockToChain[SuccPred] == &Chain
784  || (BlockFilter && !BlockFilter->count(SuccPred)))
785  continue;
786  auto Freq = MBFI->getBlockFreq(SuccPred)
787  * MBPI->getEdgeProbability(SuccPred, Succ);
788  if (Freq > SuccBestPred)
789  SuccBestPred = Freq;
790  }
791  // Qin is Succ's best unplaced incoming edge that isn't BB
792  BlockFrequency Qin = SuccBestPred;
793  // If it doesn't have a post-dominating successor, here is the calculation:
794  // BB BB
795  // | \Qout | \
796  // P| C | =
797  // = C' | C
798  // | /Qin | |
799  // | / | C' (+Succ)
800  // Succ Succ /|
801  // / \ | \/ |
802  // U/ =V | == |
803  // / \ | / \|
804  // D E D E
805  // '=' : Branch taken for that CFG edge
806  // Cost in the first case is: P + V
807  // For this calculation, we always assume P > Qout. If Qout > P
808  // The result of this function will be ignored at the caller.
809  // Let F = SuccFreq - Qin
810  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
811 
812  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
813  BranchProbability UProb = BestSuccSucc;
814  BranchProbability VProb = AdjustedSuccSumProb - UProb;
815  BlockFrequency F = SuccFreq - Qin;
816  BlockFrequency V = SuccFreq * VProb;
817  BlockFrequency QinU = std::min(Qin, F) * UProb;
818  BlockFrequency BaseCost = P + V;
819  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
820  return greaterWithBias(BaseCost, DupCost, EntryFreq);
821  }
822  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
823  BranchProbability VProb = AdjustedSuccSumProb - UProb;
824  BlockFrequency U = SuccFreq * UProb;
825  BlockFrequency V = SuccFreq * VProb;
826  BlockFrequency F = SuccFreq - Qin;
827  // If there is a post-dominating successor, here is the calculation:
828  // BB BB BB BB
829  // | \Qout | \ | \Qout | \
830  // |P C | = |P C | =
831  // = C' |P C = C' |P C
832  // | /Qin | | | /Qin | |
833  // | / | C' (+Succ) | / | C' (+Succ)
834  // Succ Succ /| Succ Succ /|
835  // | \ V | \/ | | \ V | \/ |
836  // |U \ |U /\ =? |U = |U /\ |
837  // = D = = =?| | D | = =|
838  // | / |/ D | / |/ D
839  // | / | / | = | /
840  // |/ | / |/ | =
841  // Dom Dom Dom Dom
842  // '=' : Branch taken for that CFG edge
843  // The cost for taken branches in the first case is P + U
844  // Let F = SuccFreq - Qin
845  // The cost in the second case (assuming independence), given the layout:
846  // BB, Succ, (C+Succ), D, Dom or the layout:
847  // BB, Succ, D, Dom, (C+Succ)
848  // is Qout + max(F, Qin) * U + min(F, Qin)
849  // compare P + U vs Qout + P * U + Qin.
850  //
851  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
852  //
853  // For the 3rd case, the cost is P + 2 * V
854  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
855  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
856  if (UProb > AdjustedSuccSumProb / 2 &&
857  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
858  Chain, BlockFilter))
859  // Cases 3 & 4
860  return greaterWithBias(
861  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
862  EntryFreq);
863  // Cases 1 & 2
864  return greaterWithBias((P + U),
865  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
866  std::max(Qin, F) * UProb),
867  EntryFreq);
868 }
869 
870 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
871 /// successors form the lower part of a trellis. A successor set S forms the
872 /// lower part of a trellis if all of the predecessors of S are either in S or
873 /// have all of S as successors. We ignore trellises where BB doesn't have 2
874 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
875 /// are very uncommon and complex to compute optimally. Allowing edges within S
876 /// is not strictly a trellis, but the same algorithm works, so we allow it.
877 bool MachineBlockPlacement::isTrellis(
878  const MachineBasicBlock *BB,
879  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
880  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
881  // Technically BB could form a trellis with branching factor higher than 2.
882  // But that's extremely uncommon.
883  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
884  return false;
885 
887  BB->succ_end());
888  // To avoid reviewing the same predecessors twice.
890 
891  for (MachineBasicBlock *Succ : ViableSuccs) {
892  int PredCount = 0;
893  for (auto SuccPred : Succ->predecessors()) {
894  // Allow triangle successors, but don't count them.
895  if (Successors.count(SuccPred)) {
896  // Make sure that it is actually a triangle.
897  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
898  if (!Successors.count(CheckSucc))
899  return false;
900  continue;
901  }
902  const BlockChain *PredChain = BlockToChain[SuccPred];
903  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
904  PredChain == &Chain || PredChain == BlockToChain[Succ])
905  continue;
906  ++PredCount;
907  // Perform the successor check only once.
908  if (!SeenPreds.insert(SuccPred).second)
909  continue;
910  if (!hasSameSuccessors(*SuccPred, Successors))
911  return false;
912  }
913  // If one of the successors has only BB as a predecessor, it is not a
914  // trellis.
915  if (PredCount < 1)
916  return false;
917  }
918  return true;
919 }
920 
921 /// Pick the highest total weight pair of edges that can both be laid out.
922 /// The edges in \p Edges[0] are assumed to have a different destination than
923 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
924 /// the individual highest weight edges to the 2 different destinations, or in
925 /// case of a conflict, one of them should be replaced with a 2nd best edge.
926 std::pair<MachineBlockPlacement::WeightedEdge,
927  MachineBlockPlacement::WeightedEdge>
928 MachineBlockPlacement::getBestNonConflictingEdges(
929  const MachineBasicBlock *BB,
931  Edges) {
932  // Sort the edges, and then for each successor, find the best incoming
933  // predecessor. If the best incoming predecessors aren't the same,
934  // then that is clearly the best layout. If there is a conflict, one of the
935  // successors will have to fallthrough from the second best predecessor. We
936  // compare which combination is better overall.
937 
938  // Sort for highest frequency.
939  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
940 
941  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
942  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
943  auto BestA = Edges[0].begin();
944  auto BestB = Edges[1].begin();
945  // Arrange for the correct answer to be in BestA and BestB
946  // If the 2 best edges don't conflict, the answer is already there.
947  if (BestA->Src == BestB->Src) {
948  // Compare the total fallthrough of (Best + Second Best) for both pairs
949  auto SecondBestA = std::next(BestA);
950  auto SecondBestB = std::next(BestB);
951  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
952  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
953  if (BestAScore < BestBScore)
954  BestA = SecondBestA;
955  else
956  BestB = SecondBestB;
957  }
958  // Arrange for the BB edge to be in BestA if it exists.
959  if (BestB->Src == BB)
960  std::swap(BestA, BestB);
961  return std::make_pair(*BestA, *BestB);
962 }
963 
964 /// Get the best successor from \p BB based on \p BB being part of a trellis.
965 /// We only handle trellises with 2 successors, so the algorithm is
966 /// straightforward: Find the best pair of edges that don't conflict. We find
967 /// the best incoming edge for each successor in the trellis. If those conflict,
968 /// we consider which of them should be replaced with the second best.
969 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
970 /// comes from \p BB, it will be in \p BestEdges[0]
971 MachineBlockPlacement::BlockAndTailDupResult
972 MachineBlockPlacement::getBestTrellisSuccessor(
973  const MachineBasicBlock *BB,
974  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
975  BranchProbability AdjustedSumProb, const BlockChain &Chain,
976  const BlockFilterSet *BlockFilter) {
977 
978  BlockAndTailDupResult Result = {nullptr, false};
980  BB->succ_end());
981 
982  // We assume size 2 because it's common. For general n, we would have to do
983  // the Hungarian algorithm, but it's not worth the complexity because more
984  // than 2 successors is fairly uncommon, and a trellis even more so.
985  if (Successors.size() != 2 || ViableSuccs.size() != 2)
986  return Result;
987 
988  // Collect the edge frequencies of all edges that form the trellis.
990  int SuccIndex = 0;
991  for (auto Succ : ViableSuccs) {
992  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
993  // Skip any placed predecessors that are not BB
994  if (SuccPred != BB)
995  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
996  BlockToChain[SuccPred] == &Chain ||
997  BlockToChain[SuccPred] == BlockToChain[Succ])
998  continue;
999  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1000  MBPI->getEdgeProbability(SuccPred, Succ);
1001  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1002  }
1003  ++SuccIndex;
1004  }
1005 
1006  // Pick the best combination of 2 edges from all the edges in the trellis.
1007  WeightedEdge BestA, BestB;
1008  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1009 
1010  if (BestA.Src != BB) {
1011  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1012  // we shouldn't choose any successor. We've already looked and there's a
1013  // better fallthrough edge for all the successors.
1014  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1015  return Result;
1016  }
1017 
1018  // Did we pick the triangle edge? If tail-duplication is profitable, do
1019  // that instead. Otherwise merge the triangle edge now while we know it is
1020  // optimal.
1021  if (BestA.Dest == BestB.Src) {
1022  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1023  // would be better.
1024  MachineBasicBlock *Succ1 = BestA.Dest;
1025  MachineBasicBlock *Succ2 = BestB.Dest;
1026  // Check to see if tail-duplication would be profitable.
1027  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1028  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1029  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1030  Chain, BlockFilter)) {
1032  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1033  dbgs() << " Selected: " << getBlockName(Succ2)
1034  << ", probability: " << Succ2Prob
1035  << " (Tail Duplicate)\n");
1036  Result.BB = Succ2;
1037  Result.ShouldTailDup = true;
1038  return Result;
1039  }
1040  }
1041  // We have already computed the optimal edge for the other side of the
1042  // trellis.
1043  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1044 
1045  auto TrellisSucc = BestA.Dest;
1047  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1048  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1049  << ", probability: " << SuccProb << " (Trellis)\n");
1050  Result.BB = TrellisSucc;
1051  return Result;
1052 }
1053 
1054 /// When the option allowTailDupPlacement() is on, this method checks if the
1055 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1056 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1057 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1058  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1059  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1060  if (!shouldTailDuplicate(Succ))
1061  return false;
1062 
1063  // For CFG checking.
1065  BB->succ_end());
1066  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1067  // Make sure all unplaced and unfiltered predecessors can be
1068  // tail-duplicated into.
1069  // Skip any blocks that are already placed or not in this loop.
1070  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1071  || BlockToChain[Pred] == &Chain)
1072  continue;
1073  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1074  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1075  // This will result in a trellis after tail duplication, so we don't
1076  // need to copy Succ into this predecessor. In the presence
1077  // of a trellis tail duplication can continue to be profitable.
1078  // For example:
1079  // A A
1080  // |\ |\
1081  // | \ | \
1082  // | C | C+BB
1083  // | / | |
1084  // |/ | |
1085  // BB => BB |
1086  // |\ |\/|
1087  // | \ |/\|
1088  // | D | D
1089  // | / | /
1090  // |/ |/
1091  // Succ Succ
1092  //
1093  // After BB was duplicated into C, the layout looks like the one on the
1094  // right. BB and C now have the same successors. When considering
1095  // whether Succ can be duplicated into all its unplaced predecessors, we
1096  // ignore C.
1097  // We can do this because C already has a profitable fallthrough, namely
1098  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1099  // duplication and for this test.
1100  //
1101  // This allows trellises to be laid out in 2 separate chains
1102  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1103  // because it allows the creation of 2 fallthrough paths with links
1104  // between them, and we correctly identify the best layout for these
1105  // CFGs. We want to extend trellises that the user created in addition
1106  // to trellises created by tail-duplication, so we just look for the
1107  // CFG.
1108  continue;
1109  return false;
1110  }
1111  }
1112  return true;
1113 }
1114 
1115 /// Find chains of triangles where we believe it would be profitable to
1116 /// tail-duplicate them all, but a local analysis would not find them.
1117 /// There are 3 ways this can be profitable:
1118 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1119 /// longer chains)
1120 /// 2) The chains are statically correlated. Branch probabilities have a very
1121 /// U-shaped distribution.
1122 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1123 /// If the branches in a chain are likely to be from the same side of the
1124 /// distribution as their predecessor, but are independent at runtime, this
1125 /// transformation is profitable. (Because the cost of being wrong is a small
1126 /// fixed cost, unlike the standard triangle layout where the cost of being
1127 /// wrong scales with the # of triangles.)
1128 /// 3) The chains are dynamically correlated. If the probability that a previous
1129 /// branch was taken positively influences whether the next branch will be
1130 /// taken
1131 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1132 void MachineBlockPlacement::precomputeTriangleChains() {
1133  struct TriangleChain {
1134  std::vector<MachineBasicBlock *> Edges;
1135 
1136  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1137  : Edges({src, dst}) {}
1138 
1139  void append(MachineBasicBlock *dst) {
1140  assert(getKey()->isSuccessor(dst) &&
1141  "Attempting to append a block that is not a successor.");
1142  Edges.push_back(dst);
1143  }
1144 
1145  unsigned count() const { return Edges.size() - 1; }
1146 
1147  MachineBasicBlock *getKey() const {
1148  return Edges.back();
1149  }
1150  };
1151 
1152  if (TriangleChainCount == 0)
1153  return;
1154 
1155  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1156  // Map from last block to the chain that contains it. This allows us to extend
1157  // chains as we find new triangles.
1159  for (MachineBasicBlock &BB : *F) {
1160  // If BB doesn't have 2 successors, it doesn't start a triangle.
1161  if (BB.succ_size() != 2)
1162  continue;
1163  MachineBasicBlock *PDom = nullptr;
1164  for (MachineBasicBlock *Succ : BB.successors()) {
1165  if (!MPDT->dominates(Succ, &BB))
1166  continue;
1167  PDom = Succ;
1168  break;
1169  }
1170  // If BB doesn't have a post-dominating successor, it doesn't form a
1171  // triangle.
1172  if (PDom == nullptr)
1173  continue;
1174  // If PDom has a hint that it is low probability, skip this triangle.
1175  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1176  continue;
1177  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1178  // we're looking for.
1179  if (!shouldTailDuplicate(PDom))
1180  continue;
1181  bool CanTailDuplicate = true;
1182  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1183  // isn't the kind of triangle we're looking for.
1184  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1185  if (Pred == &BB)
1186  continue;
1187  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1188  CanTailDuplicate = false;
1189  break;
1190  }
1191  }
1192  // If we can't tail-duplicate PDom to its predecessors, then skip this
1193  // triangle.
1194  if (!CanTailDuplicate)
1195  continue;
1196 
1197  // Now we have an interesting triangle. Insert it if it's not part of an
1198  // existing chain.
1199  // Note: This cannot be replaced with a call insert() or emplace() because
1200  // the find key is BB, but the insert/emplace key is PDom.
1201  auto Found = TriangleChainMap.find(&BB);
1202  // If it is, remove the chain from the map, grow it, and put it back in the
1203  // map with the end as the new key.
1204  if (Found != TriangleChainMap.end()) {
1205  TriangleChain Chain = std::move(Found->second);
1206  TriangleChainMap.erase(Found);
1207  Chain.append(PDom);
1208  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1209  } else {
1210  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1211  assert(InsertResult.second && "Block seen twice.");
1212  (void)InsertResult;
1213  }
1214  }
1215 
1216  // Iterating over a DenseMap is safe here, because the only thing in the body
1217  // of the loop is inserting into another DenseMap (ComputedEdges).
1218  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1219  for (auto &ChainPair : TriangleChainMap) {
1220  TriangleChain &Chain = ChainPair.second;
1221  // Benchmarking has shown that due to branch correlation duplicating 2 or
1222  // more triangles is profitable, despite the calculations assuming
1223  // independence.
1224  if (Chain.count() < TriangleChainCount)
1225  continue;
1226  MachineBasicBlock *dst = Chain.Edges.back();
1227  Chain.Edges.pop_back();
1228  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1229  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1230  << getBlockName(dst)
1231  << " as pre-computed based on triangles.\n");
1232 
1233  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1234  assert(InsertResult.second && "Block seen twice.");
1235  (void)InsertResult;
1236 
1237  dst = src;
1238  }
1239  }
1240 }
1241 
1242 // When profile is not present, return the StaticLikelyProb.
1243 // When profile is available, we need to handle the triangle-shape CFG.
1245  const MachineBasicBlock *BB) {
1246  if (!BB->getParent()->getFunction().hasProfileData())
1247  return BranchProbability(StaticLikelyProb, 100);
1248  if (BB->succ_size() == 2) {
1249  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1250  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1251  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1252  /* See case 1 below for the cost analysis. For BB->Succ to
1253  * be taken with smaller cost, the following needs to hold:
1254  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1255  * So the threshold T in the calculation below
1256  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1257  * So T / (1 - T) = 2, Yielding T = 2/3
1258  * Also adding user specified branch bias, we have
1259  * T = (2/3)*(ProfileLikelyProb/50)
1260  * = (2*ProfileLikelyProb)/150)
1261  */
1262  return BranchProbability(2 * ProfileLikelyProb, 150);
1263  }
1264  }
1265  return BranchProbability(ProfileLikelyProb, 100);
1266 }
1267 
1268 /// Checks to see if the layout candidate block \p Succ has a better layout
1269 /// predecessor than \c BB. If yes, returns true.
1270 /// \p SuccProb: The probability adjusted for only remaining blocks.
1271 /// Only used for logging
1272 /// \p RealSuccProb: The un-adjusted probability.
1273 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1274 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1275 /// considered
1276 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1277  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1278  const BlockChain &SuccChain, BranchProbability SuccProb,
1279  BranchProbability RealSuccProb, const BlockChain &Chain,
1280  const BlockFilterSet *BlockFilter) {
1281 
1282  // There isn't a better layout when there are no unscheduled predecessors.
1283  if (SuccChain.UnscheduledPredecessors == 0)
1284  return false;
1285 
1286  // There are two basic scenarios here:
1287  // -------------------------------------
1288  // Case 1: triangular shape CFG (if-then):
1289  // BB
1290  // | \
1291  // | \
1292  // | Pred
1293  // | /
1294  // Succ
1295  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1296  // set Succ as the layout successor of BB. Picking Succ as BB's
1297  // successor breaks the CFG constraints (FIXME: define these constraints).
1298  // With this layout, Pred BB
1299  // is forced to be outlined, so the overall cost will be cost of the
1300  // branch taken from BB to Pred, plus the cost of back taken branch
1301  // from Pred to Succ, as well as the additional cost associated
1302  // with the needed unconditional jump instruction from Pred To Succ.
1303 
1304  // The cost of the topological order layout is the taken branch cost
1305  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1306  // must hold:
1307  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1308  // < freq(BB->Succ) * taken_branch_cost.
1309  // Ignoring unconditional jump cost, we get
1310  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1311  // prob(BB->Succ) > 2 * prob(BB->Pred)
1312  //
1313  // When real profile data is available, we can precisely compute the
1314  // probability threshold that is needed for edge BB->Succ to be considered.
1315  // Without profile data, the heuristic requires the branch bias to be
1316  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1317  // -----------------------------------------------------------------
1318  // Case 2: diamond like CFG (if-then-else):
1319  // S
1320  // / \
1321  // | \
1322  // BB Pred
1323  // \ /
1324  // Succ
1325  // ..
1326  //
1327  // The current block is BB and edge BB->Succ is now being evaluated.
1328  // Note that edge S->BB was previously already selected because
1329  // prob(S->BB) > prob(S->Pred).
1330  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1331  // choose Pred, we will have a topological ordering as shown on the left
1332  // in the picture below. If we choose Succ, we have the solution as shown
1333  // on the right:
1334  //
1335  // topo-order:
1336  //
1337  // S----- ---S
1338  // | | | |
1339  // ---BB | | BB
1340  // | | | |
1341  // | Pred-- | Succ--
1342  // | | | |
1343  // ---Succ ---Pred--
1344  //
1345  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1346  // = freq(S->Pred) + freq(S->BB)
1347  //
1348  // If we have profile data (i.e, branch probabilities can be trusted), the
1349  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1350  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1351  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1352  // means the cost of topological order is greater.
1353  // When profile data is not available, however, we need to be more
1354  // conservative. If the branch prediction is wrong, breaking the topo-order
1355  // will actually yield a layout with large cost. For this reason, we need
1356  // strong biased branch at block S with Prob(S->BB) in order to select
1357  // BB->Succ. This is equivalent to looking the CFG backward with backward
1358  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1359  // profile data).
1360  // --------------------------------------------------------------------------
1361  // Case 3: forked diamond
1362  // S
1363  // / \
1364  // / \
1365  // BB Pred
1366  // | \ / |
1367  // | \ / |
1368  // | X |
1369  // | / \ |
1370  // | / \ |
1371  // S1 S2
1372  //
1373  // The current block is BB and edge BB->S1 is now being evaluated.
1374  // As above S->BB was already selected because
1375  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1376  //
1377  // topo-order:
1378  //
1379  // S-------| ---S
1380  // | | | |
1381  // ---BB | | BB
1382  // | | | |
1383  // | Pred----| | S1----
1384  // | | | |
1385  // --(S1 or S2) ---Pred--
1386  // |
1387  // S2
1388  //
1389  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1390  // + min(freq(Pred->S1), freq(Pred->S2))
1391  // Non-topo-order cost:
1392  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1393  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1394  // is 0. Then the non topo layout is better when
1395  // freq(S->Pred) < freq(BB->S1).
1396  // This is exactly what is checked below.
1397  // Note there are other shapes that apply (Pred may not be a single block,
1398  // but they all fit this general pattern.)
1400 
1401  // Make sure that a hot successor doesn't have a globally more
1402  // important predecessor.
1403  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1404  bool BadCFGConflict = false;
1405 
1406  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1407  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1408  (BlockFilter && !BlockFilter->count(Pred)) ||
1409  BlockToChain[Pred] == &Chain ||
1410  // This check is redundant except for look ahead. This function is
1411  // called for lookahead by isProfitableToTailDup when BB hasn't been
1412  // placed yet.
1413  (Pred == BB))
1414  continue;
1415  // Do backward checking.
1416  // For all cases above, we need a backward checking to filter out edges that
1417  // are not 'strongly' biased.
1418  // BB Pred
1419  // \ /
1420  // Succ
1421  // We select edge BB->Succ if
1422  // freq(BB->Succ) > freq(Succ) * HotProb
1423  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1424  // HotProb
1425  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1426  // Case 1 is covered too, because the first equation reduces to:
1427  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1428  BlockFrequency PredEdgeFreq =
1429  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1430  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1431  BadCFGConflict = true;
1432  break;
1433  }
1434  }
1435 
1436  if (BadCFGConflict) {
1437  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1438  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1439  return true;
1440  }
1441 
1442  return false;
1443 }
1444 
1445 /// Select the best successor for a block.
1446 ///
1447 /// This looks across all successors of a particular block and attempts to
1448 /// select the "best" one to be the layout successor. It only considers direct
1449 /// successors which also pass the block filter. It will attempt to avoid
1450 /// breaking CFG structure, but cave and break such structures in the case of
1451 /// very hot successor edges.
1452 ///
1453 /// \returns The best successor block found, or null if none are viable, along
1454 /// with a boolean indicating if tail duplication is necessary.
1455 MachineBlockPlacement::BlockAndTailDupResult
1456 MachineBlockPlacement::selectBestSuccessor(
1457  const MachineBasicBlock *BB, const BlockChain &Chain,
1458  const BlockFilterSet *BlockFilter) {
1459  const BranchProbability HotProb(StaticLikelyProb, 100);
1460 
1461  BlockAndTailDupResult BestSucc = { nullptr, false };
1462  auto BestProb = BranchProbability::getZero();
1463 
1465  auto AdjustedSumProb =
1466  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1467 
1468  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1469  << "\n");
1470 
1471  // if we already precomputed the best successor for BB, return that if still
1472  // applicable.
1473  auto FoundEdge = ComputedEdges.find(BB);
1474  if (FoundEdge != ComputedEdges.end()) {
1475  MachineBasicBlock *Succ = FoundEdge->second.BB;
1476  ComputedEdges.erase(FoundEdge);
1477  BlockChain *SuccChain = BlockToChain[Succ];
1478  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1479  SuccChain != &Chain && Succ == *SuccChain->begin())
1480  return FoundEdge->second;
1481  }
1482 
1483  // if BB is part of a trellis, Use the trellis to determine the optimal
1484  // fallthrough edges
1485  if (isTrellis(BB, Successors, Chain, BlockFilter))
1486  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1487  BlockFilter);
1488 
1489  // For blocks with CFG violations, we may be able to lay them out anyway with
1490  // tail-duplication. We keep this vector so we can perform the probability
1491  // calculations the minimum number of times.
1493  DupCandidates;
1494  for (MachineBasicBlock *Succ : Successors) {
1495  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1496  BranchProbability SuccProb =
1497  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1498 
1499  BlockChain &SuccChain = *BlockToChain[Succ];
1500  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1501  // predecessor that yields lower global cost.
1502  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1503  Chain, BlockFilter)) {
1504  // If tail duplication would make Succ profitable, place it.
1505  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1506  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1507  continue;
1508  }
1509 
1510  LLVM_DEBUG(
1511  dbgs() << " Candidate: " << getBlockName(Succ)
1512  << ", probability: " << SuccProb
1513  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1514  << "\n");
1515 
1516  if (BestSucc.BB && BestProb >= SuccProb) {
1517  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1518  continue;
1519  }
1520 
1521  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1522  BestSucc.BB = Succ;
1523  BestProb = SuccProb;
1524  }
1525  // Handle the tail duplication candidates in order of decreasing probability.
1526  // Stop at the first one that is profitable. Also stop if they are less
1527  // profitable than BestSucc. Position is important because we preserve it and
1528  // prefer first best match. Here we aren't comparing in order, so we capture
1529  // the position instead.
1530  if (DupCandidates.size() != 0) {
1531  auto cmp =
1532  [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
1533  const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
1534  return std::get<0>(a) > std::get<0>(b);
1535  };
1536  std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
1537  }
1538  for(auto &Tup : DupCandidates) {
1539  BranchProbability DupProb;
1540  MachineBasicBlock *Succ;
1541  std::tie(DupProb, Succ) = Tup;
1542  if (DupProb < BestProb)
1543  break;
1544  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1545  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1546  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1547  << ", probability: " << DupProb
1548  << " (Tail Duplicate)\n");
1549  BestSucc.BB = Succ;
1550  BestSucc.ShouldTailDup = true;
1551  break;
1552  }
1553  }
1554 
1555  if (BestSucc.BB)
1556  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1557 
1558  return BestSucc;
1559 }
1560 
1561 /// Select the best block from a worklist.
1562 ///
1563 /// This looks through the provided worklist as a list of candidate basic
1564 /// blocks and select the most profitable one to place. The definition of
1565 /// profitable only really makes sense in the context of a loop. This returns
1566 /// the most frequently visited block in the worklist, which in the case of
1567 /// a loop, is the one most desirable to be physically close to the rest of the
1568 /// loop body in order to improve i-cache behavior.
1569 ///
1570 /// \returns The best block found, or null if none are viable.
1571 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1572  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1573  // Once we need to walk the worklist looking for a candidate, cleanup the
1574  // worklist of already placed entries.
1575  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1576  // some code complexity) into the loop below.
1577  WorkList.erase(llvm::remove_if(WorkList,
1578  [&](MachineBasicBlock *BB) {
1579  return BlockToChain.lookup(BB) == &Chain;
1580  }),
1581  WorkList.end());
1582 
1583  if (WorkList.empty())
1584  return nullptr;
1585 
1586  bool IsEHPad = WorkList[0]->isEHPad();
1587 
1588  MachineBasicBlock *BestBlock = nullptr;
1589  BlockFrequency BestFreq;
1590  for (MachineBasicBlock *MBB : WorkList) {
1591  assert(MBB->isEHPad() == IsEHPad &&
1592  "EHPad mismatch between block and work list.");
1593 
1594  BlockChain &SuccChain = *BlockToChain[MBB];
1595  if (&SuccChain == &Chain)
1596  continue;
1597 
1598  assert(SuccChain.UnscheduledPredecessors == 0 &&
1599  "Found CFG-violating block");
1600 
1601  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1602  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1603  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1604 
1605  // For ehpad, we layout the least probable first as to avoid jumping back
1606  // from least probable landingpads to more probable ones.
1607  //
1608  // FIXME: Using probability is probably (!) not the best way to achieve
1609  // this. We should probably have a more principled approach to layout
1610  // cleanup code.
1611  //
1612  // The goal is to get:
1613  //
1614  // +--------------------------+
1615  // | V
1616  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1617  //
1618  // Rather than:
1619  //
1620  // +-------------------------------------+
1621  // V |
1622  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1623  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1624  continue;
1625 
1626  BestBlock = MBB;
1627  BestFreq = CandidateFreq;
1628  }
1629 
1630  return BestBlock;
1631 }
1632 
1633 /// Retrieve the first unplaced basic block.
1634 ///
1635 /// This routine is called when we are unable to use the CFG to walk through
1636 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1637 /// We walk through the function's blocks in order, starting from the
1638 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1639 /// re-scanning the entire sequence on repeated calls to this routine.
1640 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1641  const BlockChain &PlacedChain,
1642  MachineFunction::iterator &PrevUnplacedBlockIt,
1643  const BlockFilterSet *BlockFilter) {
1644  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1645  ++I) {
1646  if (BlockFilter && !BlockFilter->count(&*I))
1647  continue;
1648  if (BlockToChain[&*I] != &PlacedChain) {
1649  PrevUnplacedBlockIt = I;
1650  // Now select the head of the chain to which the unplaced block belongs
1651  // as the block to place. This will force the entire chain to be placed,
1652  // and satisfies the requirements of merging chains.
1653  return *BlockToChain[&*I]->begin();
1654  }
1655  }
1656  return nullptr;
1657 }
1658 
1659 void MachineBlockPlacement::fillWorkLists(
1660  const MachineBasicBlock *MBB,
1661  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1662  const BlockFilterSet *BlockFilter = nullptr) {
1663  BlockChain &Chain = *BlockToChain[MBB];
1664  if (!UpdatedPreds.insert(&Chain).second)
1665  return;
1666 
1667  assert(
1668  Chain.UnscheduledPredecessors == 0 &&
1669  "Attempting to place block with unscheduled predecessors in worklist.");
1670  for (MachineBasicBlock *ChainBB : Chain) {
1671  assert(BlockToChain[ChainBB] == &Chain &&
1672  "Block in chain doesn't match BlockToChain map.");
1673  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1674  if (BlockFilter && !BlockFilter->count(Pred))
1675  continue;
1676  if (BlockToChain[Pred] == &Chain)
1677  continue;
1678  ++Chain.UnscheduledPredecessors;
1679  }
1680  }
1681 
1682  if (Chain.UnscheduledPredecessors != 0)
1683  return;
1684 
1685  MachineBasicBlock *BB = *Chain.begin();
1686  if (BB->isEHPad())
1687  EHPadWorkList.push_back(BB);
1688  else
1689  BlockWorkList.push_back(BB);
1690 }
1691 
1692 void MachineBlockPlacement::buildChain(
1693  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1694  BlockFilterSet *BlockFilter) {
1695  assert(HeadBB && "BB must not be null.\n");
1696  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1697  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1698 
1699  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1700  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1701  MachineBasicBlock *BB = *std::prev(Chain.end());
1702  while (true) {
1703  assert(BB && "null block found at end of chain in loop.");
1704  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1705  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1706 
1707 
1708  // Look for the best viable successor if there is one to place immediately
1709  // after this block.
1710  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1711  MachineBasicBlock* BestSucc = Result.BB;
1712  bool ShouldTailDup = Result.ShouldTailDup;
1713  if (allowTailDupPlacement())
1714  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1715 
1716  // If an immediate successor isn't available, look for the best viable
1717  // block among those we've identified as not violating the loop's CFG at
1718  // this point. This won't be a fallthrough, but it will increase locality.
1719  if (!BestSucc)
1720  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1721  if (!BestSucc)
1722  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1723 
1724  if (!BestSucc) {
1725  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1726  if (!BestSucc)
1727  break;
1728 
1729  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1730  "layout successor until the CFG reduces\n");
1731  }
1732 
1733  // Placement may have changed tail duplication opportunities.
1734  // Check for that now.
1735  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1736  // If the chosen successor was duplicated into all its predecessors,
1737  // don't bother laying it out, just go round the loop again with BB as
1738  // the chain end.
1739  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1740  BlockFilter, PrevUnplacedBlockIt))
1741  continue;
1742  }
1743 
1744  // Place this block, updating the datastructures to reflect its placement.
1745  BlockChain &SuccChain = *BlockToChain[BestSucc];
1746  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1747  // we selected a successor that didn't fit naturally into the CFG.
1748  SuccChain.UnscheduledPredecessors = 0;
1749  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1750  << getBlockName(BestSucc) << "\n");
1751  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1752  Chain.merge(BestSucc, &SuccChain);
1753  BB = *std::prev(Chain.end());
1754  }
1755 
1756  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1757  << getBlockName(*Chain.begin()) << "\n");
1758 }
1759 
1760 /// Find the best loop top block for layout.
1761 ///
1762 /// Look for a block which is strictly better than the loop header for laying
1763 /// out at the top of the loop. This looks for one and only one pattern:
1764 /// a latch block with no conditional exit. This block will cause a conditional
1765 /// jump around it or will be the bottom of the loop if we lay it out in place,
1766 /// but if it it doesn't end up at the bottom of the loop for any reason,
1767 /// rotation alone won't fix it. Because such a block will always result in an
1768 /// unconditional jump (for the backedge) rotating it in front of the loop
1769 /// header is always profitable.
1771 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
1772  const BlockFilterSet &LoopBlockSet) {
1773  // Placing the latch block before the header may introduce an extra branch
1774  // that skips this block the first time the loop is executed, which we want
1775  // to avoid when optimising for size.
1776  // FIXME: in theory there is a case that does not introduce a new branch,
1777  // i.e. when the layout predecessor does not fallthrough to the loop header.
1778  // In practice this never happens though: there always seems to be a preheader
1779  // that can fallthrough and that is also placed before the header.
1780  if (F->getFunction().optForSize())
1781  return L.getHeader();
1782 
1783  // Check that the header hasn't been fused with a preheader block due to
1784  // crazy branches. If it has, we need to start with the header at the top to
1785  // prevent pulling the preheader into the loop body.
1786  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1787  if (!LoopBlockSet.count(*HeaderChain.begin()))
1788  return L.getHeader();
1789 
1790  LLVM_DEBUG(dbgs() << "Finding best loop top for: "
1791  << getBlockName(L.getHeader()) << "\n");
1792 
1793  BlockFrequency BestPredFreq;
1794  MachineBasicBlock *BestPred = nullptr;
1795  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
1796  if (!LoopBlockSet.count(Pred))
1797  continue;
1798  LLVM_DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
1799  << Pred->succ_size() << " successors, ";
1800  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1801  if (Pred->succ_size() > 1)
1802  continue;
1803 
1804  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
1805  if (!BestPred || PredFreq > BestPredFreq ||
1806  (!(PredFreq < BestPredFreq) &&
1807  Pred->isLayoutSuccessor(L.getHeader()))) {
1808  BestPred = Pred;
1809  BestPredFreq = PredFreq;
1810  }
1811  }
1812 
1813  // If no direct predecessor is fine, just use the loop header.
1814  if (!BestPred) {
1815  LLVM_DEBUG(dbgs() << " final top unchanged\n");
1816  return L.getHeader();
1817  }
1818 
1819  // Walk backwards through any straight line of predecessors.
1820  while (BestPred->pred_size() == 1 &&
1821  (*BestPred->pred_begin())->succ_size() == 1 &&
1822  *BestPred->pred_begin() != L.getHeader())
1823  BestPred = *BestPred->pred_begin();
1824 
1825  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
1826  return BestPred;
1827 }
1828 
1829 /// Find the best loop exiting block for layout.
1830 ///
1831 /// This routine implements the logic to analyze the loop looking for the best
1832 /// block to layout at the top of the loop. Typically this is done to maximize
1833 /// fallthrough opportunities.
1835 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
1836  const BlockFilterSet &LoopBlockSet) {
1837  // We don't want to layout the loop linearly in all cases. If the loop header
1838  // is just a normal basic block in the loop, we want to look for what block
1839  // within the loop is the best one to layout at the top. However, if the loop
1840  // header has be pre-merged into a chain due to predecessors not having
1841  // analyzable branches, *and* the predecessor it is merged with is *not* part
1842  // of the loop, rotating the header into the middle of the loop will create
1843  // a non-contiguous range of blocks which is Very Bad. So start with the
1844  // header and only rotate if safe.
1845  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1846  if (!LoopBlockSet.count(*HeaderChain.begin()))
1847  return nullptr;
1848 
1849  BlockFrequency BestExitEdgeFreq;
1850  unsigned BestExitLoopDepth = 0;
1851  MachineBasicBlock *ExitingBB = nullptr;
1852  // If there are exits to outer loops, loop rotation can severely limit
1853  // fallthrough opportunities unless it selects such an exit. Keep a set of
1854  // blocks where rotating to exit with that block will reach an outer loop.
1855  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
1856 
1857  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
1858  << getBlockName(L.getHeader()) << "\n");
1859  for (MachineBasicBlock *MBB : L.getBlocks()) {
1860  BlockChain &Chain = *BlockToChain[MBB];
1861  // Ensure that this block is at the end of a chain; otherwise it could be
1862  // mid-way through an inner loop or a successor of an unanalyzable branch.
1863  if (MBB != *std::prev(Chain.end()))
1864  continue;
1865 
1866  // Now walk the successors. We need to establish whether this has a viable
1867  // exiting successor and whether it has a viable non-exiting successor.
1868  // We store the old exiting state and restore it if a viable looping
1869  // successor isn't found.
1870  MachineBasicBlock *OldExitingBB = ExitingBB;
1871  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
1872  bool HasLoopingSucc = false;
1873  for (MachineBasicBlock *Succ : MBB->successors()) {
1874  if (Succ->isEHPad())
1875  continue;
1876  if (Succ == MBB)
1877  continue;
1878  BlockChain &SuccChain = *BlockToChain[Succ];
1879  // Don't split chains, either this chain or the successor's chain.
1880  if (&Chain == &SuccChain) {
1881  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1882  << getBlockName(Succ) << " (chain conflict)\n");
1883  continue;
1884  }
1885 
1886  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
1887  if (LoopBlockSet.count(Succ)) {
1888  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
1889  << getBlockName(Succ) << " (" << SuccProb << ")\n");
1890  HasLoopingSucc = true;
1891  continue;
1892  }
1893 
1894  unsigned SuccLoopDepth = 0;
1895  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
1896  SuccLoopDepth = ExitLoop->getLoopDepth();
1897  if (ExitLoop->contains(&L))
1898  BlocksExitingToOuterLoop.insert(MBB);
1899  }
1900 
1901  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
1902  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1903  << getBlockName(Succ) << " [L:" << SuccLoopDepth
1904  << "] (";
1905  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
1906  // Note that we bias this toward an existing layout successor to retain
1907  // incoming order in the absence of better information. The exit must have
1908  // a frequency higher than the current exit before we consider breaking
1909  // the layout.
1910  BranchProbability Bias(100 - ExitBlockBias, 100);
1911  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
1912  ExitEdgeFreq > BestExitEdgeFreq ||
1913  (MBB->isLayoutSuccessor(Succ) &&
1914  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
1915  BestExitEdgeFreq = ExitEdgeFreq;
1916  ExitingBB = MBB;
1917  }
1918  }
1919 
1920  if (!HasLoopingSucc) {
1921  // Restore the old exiting state, no viable looping successor was found.
1922  ExitingBB = OldExitingBB;
1923  BestExitEdgeFreq = OldBestExitEdgeFreq;
1924  }
1925  }
1926  // Without a candidate exiting block or with only a single block in the
1927  // loop, just use the loop header to layout the loop.
1928  if (!ExitingBB) {
1929  LLVM_DEBUG(
1930  dbgs() << " No other candidate exit blocks, using loop header\n");
1931  return nullptr;
1932  }
1933  if (L.getNumBlocks() == 1) {
1934  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
1935  return nullptr;
1936  }
1937 
1938  // Also, if we have exit blocks which lead to outer loops but didn't select
1939  // one of them as the exiting block we are rotating toward, disable loop
1940  // rotation altogether.
1941  if (!BlocksExitingToOuterLoop.empty() &&
1942  !BlocksExitingToOuterLoop.count(ExitingBB))
1943  return nullptr;
1944 
1945  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
1946  << "\n");
1947  return ExitingBB;
1948 }
1949 
1950 /// Attempt to rotate an exiting block to the bottom of the loop.
1951 ///
1952 /// Once we have built a chain, try to rotate it to line up the hot exit block
1953 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
1954 /// branches. For example, if the loop has fallthrough into its header and out
1955 /// of its bottom already, don't rotate it.
1956 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
1957  const MachineBasicBlock *ExitingBB,
1958  const BlockFilterSet &LoopBlockSet) {
1959  if (!ExitingBB)
1960  return;
1961 
1962  MachineBasicBlock *Top = *LoopChain.begin();
1963  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
1964 
1965  // If ExitingBB is already the last one in a chain then nothing to do.
1966  if (Bottom == ExitingBB)
1967  return;
1968 
1969  bool ViableTopFallthrough = false;
1970  for (MachineBasicBlock *Pred : Top->predecessors()) {
1971  BlockChain *PredChain = BlockToChain[Pred];
1972  if (!LoopBlockSet.count(Pred) &&
1973  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1974  ViableTopFallthrough = true;
1975  break;
1976  }
1977  }
1978 
1979  // If the header has viable fallthrough, check whether the current loop
1980  // bottom is a viable exiting block. If so, bail out as rotating will
1981  // introduce an unnecessary branch.
1982  if (ViableTopFallthrough) {
1983  for (MachineBasicBlock *Succ : Bottom->successors()) {
1984  BlockChain *SuccChain = BlockToChain[Succ];
1985  if (!LoopBlockSet.count(Succ) &&
1986  (!SuccChain || Succ == *SuccChain->begin()))
1987  return;
1988  }
1989  }
1990 
1991  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
1992  if (ExitIt == LoopChain.end())
1993  return;
1994 
1995  // Rotating a loop exit to the bottom when there is a fallthrough to top
1996  // trades the entry fallthrough for an exit fallthrough.
1997  // If there is no bottom->top edge, but the chosen exit block does have
1998  // a fallthrough, we break that fallthrough for nothing in return.
1999 
2000  // Let's consider an example. We have a built chain of basic blocks
2001  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2002  // By doing a rotation we get
2003  // Bk+1, ..., Bn, B1, ..., Bk
2004  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2005  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2006  // It might be compensated by fallthrough Bn -> B1.
2007  // So we have a condition to avoid creation of extra branch by loop rotation.
2008  // All below must be true to avoid loop rotation:
2009  // If there is a fallthrough to top (B1)
2010  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2011  // There is no fallthrough from bottom (Bn) to top (B1).
2012  // Please note that there is no exit fallthrough from Bn because we checked it
2013  // above.
2014  if (ViableTopFallthrough) {
2015  assert(std::next(ExitIt) != LoopChain.end() &&
2016  "Exit should not be last BB");
2017  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2018  if (ExitingBB->isSuccessor(NextBlockInChain))
2019  if (!Bottom->isSuccessor(Top))
2020  return;
2021  }
2022 
2023  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2024  << " at bottom\n");
2025  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2026 }
2027 
2028 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2029 ///
2030 /// With profile data, we can determine the cost in terms of missed fall through
2031 /// opportunities when rotating a loop chain and select the best rotation.
2032 /// Basically, there are three kinds of cost to consider for each rotation:
2033 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2034 /// the loop to the loop header.
2035 /// 2. The possibly missed fall through edges (if they exist) from the loop
2036 /// exits to BB out of the loop.
2037 /// 3. The missed fall through edge (if it exists) from the last BB to the
2038 /// first BB in the loop chain.
2039 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2040 /// We select the best rotation with the smallest cost.
2041 void MachineBlockPlacement::rotateLoopWithProfile(
2042  BlockChain &LoopChain, const MachineLoop &L,
2043  const BlockFilterSet &LoopBlockSet) {
2044  auto HeaderBB = L.getHeader();
2045  auto HeaderIter = llvm::find(LoopChain, HeaderBB);
2046  auto RotationPos = LoopChain.end();
2047 
2048  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2049 
2050  // A utility lambda that scales up a block frequency by dividing it by a
2051  // branch probability which is the reciprocal of the scale.
2052  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2053  unsigned Scale) -> BlockFrequency {
2054  if (Scale == 0)
2055  return 0;
2056  // Use operator / between BlockFrequency and BranchProbability to implement
2057  // saturating multiplication.
2058  return Freq / BranchProbability(1, Scale);
2059  };
2060 
2061  // Compute the cost of the missed fall-through edge to the loop header if the
2062  // chain head is not the loop header. As we only consider natural loops with
2063  // single header, this computation can be done only once.
2064  BlockFrequency HeaderFallThroughCost(0);
2065  for (auto *Pred : HeaderBB->predecessors()) {
2066  BlockChain *PredChain = BlockToChain[Pred];
2067  if (!LoopBlockSet.count(Pred) &&
2068  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2069  auto EdgeFreq =
2070  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
2071  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2072  // If the predecessor has only an unconditional jump to the header, we
2073  // need to consider the cost of this jump.
2074  if (Pred->succ_size() == 1)
2075  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2076  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2077  }
2078  }
2079 
2080  // Here we collect all exit blocks in the loop, and for each exit we find out
2081  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2082  // as the sum of frequencies of exit edges we collect here, excluding the exit
2083  // edge from the tail of the loop chain.
2085  for (auto BB : LoopChain) {
2086  auto LargestExitEdgeProb = BranchProbability::getZero();
2087  for (auto *Succ : BB->successors()) {
2088  BlockChain *SuccChain = BlockToChain[Succ];
2089  if (!LoopBlockSet.count(Succ) &&
2090  (!SuccChain || Succ == *SuccChain->begin())) {
2091  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2092  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2093  }
2094  }
2095  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2096  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2097  ExitsWithFreq.emplace_back(BB, ExitFreq);
2098  }
2099  }
2100 
2101  // In this loop we iterate every block in the loop chain and calculate the
2102  // cost assuming the block is the head of the loop chain. When the loop ends,
2103  // we should have found the best candidate as the loop chain's head.
2104  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2105  EndIter = LoopChain.end();
2106  Iter != EndIter; Iter++, TailIter++) {
2107  // TailIter is used to track the tail of the loop chain if the block we are
2108  // checking (pointed by Iter) is the head of the chain.
2109  if (TailIter == LoopChain.end())
2110  TailIter = LoopChain.begin();
2111 
2112  auto TailBB = *TailIter;
2113 
2114  // Calculate the cost by putting this BB to the top.
2115  BlockFrequency Cost = 0;
2116 
2117  // If the current BB is the loop header, we need to take into account the
2118  // cost of the missed fall through edge from outside of the loop to the
2119  // header.
2120  if (Iter != HeaderIter)
2121  Cost += HeaderFallThroughCost;
2122 
2123  // Collect the loop exit cost by summing up frequencies of all exit edges
2124  // except the one from the chain tail.
2125  for (auto &ExitWithFreq : ExitsWithFreq)
2126  if (TailBB != ExitWithFreq.first)
2127  Cost += ExitWithFreq.second;
2128 
2129  // The cost of breaking the once fall-through edge from the tail to the top
2130  // of the loop chain. Here we need to consider three cases:
2131  // 1. If the tail node has only one successor, then we will get an
2132  // additional jmp instruction. So the cost here is (MisfetchCost +
2133  // JumpInstCost) * tail node frequency.
2134  // 2. If the tail node has two successors, then we may still get an
2135  // additional jmp instruction if the layout successor after the loop
2136  // chain is not its CFG successor. Note that the more frequently executed
2137  // jmp instruction will be put ahead of the other one. Assume the
2138  // frequency of those two branches are x and y, where x is the frequency
2139  // of the edge to the chain head, then the cost will be
2140  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2141  // 3. If the tail node has more than two successors (this rarely happens),
2142  // we won't consider any additional cost.
2143  if (TailBB->isSuccessor(*Iter)) {
2144  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2145  if (TailBB->succ_size() == 1)
2146  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2148  else if (TailBB->succ_size() == 2) {
2149  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2150  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2151  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2152  ? TailBBFreq * TailToHeadProb.getCompl()
2153  : TailToHeadFreq;
2154  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2155  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2156  }
2157  }
2158 
2159  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2160  << getBlockName(*Iter)
2161  << " to the top: " << Cost.getFrequency() << "\n");
2162 
2163  if (Cost < SmallestRotationCost) {
2164  SmallestRotationCost = Cost;
2165  RotationPos = Iter;
2166  }
2167  }
2168 
2169  if (RotationPos != LoopChain.end()) {
2170  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2171  << " to the top\n");
2172  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2173  }
2174 }
2175 
2176 /// Collect blocks in the given loop that are to be placed.
2177 ///
2178 /// When profile data is available, exclude cold blocks from the returned set;
2179 /// otherwise, collect all blocks in the loop.
2181 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2182  BlockFilterSet LoopBlockSet;
2183 
2184  // Filter cold blocks off from LoopBlockSet when profile data is available.
2185  // Collect the sum of frequencies of incoming edges to the loop header from
2186  // outside. If we treat the loop as a super block, this is the frequency of
2187  // the loop. Then for each block in the loop, we calculate the ratio between
2188  // its frequency and the frequency of the loop block. When it is too small,
2189  // don't add it to the loop chain. If there are outer loops, then this block
2190  // will be merged into the first outer loop chain for which this block is not
2191  // cold anymore. This needs precise profile data and we only do this when
2192  // profile data is available.
2194  BlockFrequency LoopFreq(0);
2195  for (auto LoopPred : L.getHeader()->predecessors())
2196  if (!L.contains(LoopPred))
2197  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2198  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2199 
2200  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2201  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2202  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2203  continue;
2204  LoopBlockSet.insert(LoopBB);
2205  }
2206  } else
2207  LoopBlockSet.insert(L.block_begin(), L.block_end());
2208 
2209  return LoopBlockSet;
2210 }
2211 
2212 /// Forms basic block chains from the natural loop structures.
2213 ///
2214 /// These chains are designed to preserve the existing *structure* of the code
2215 /// as much as possible. We can then stitch the chains together in a way which
2216 /// both preserves the topological structure and minimizes taken conditional
2217 /// branches.
2218 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2219  // First recurse through any nested loops, building chains for those inner
2220  // loops.
2221  for (const MachineLoop *InnerLoop : L)
2222  buildLoopChains(*InnerLoop);
2223 
2224  assert(BlockWorkList.empty() &&
2225  "BlockWorkList not empty when starting to build loop chains.");
2226  assert(EHPadWorkList.empty() &&
2227  "EHPadWorkList not empty when starting to build loop chains.");
2228  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2229 
2230  // Check if we have profile data for this function. If yes, we will rotate
2231  // this loop by modeling costs more precisely which requires the profile data
2232  // for better layout.
2233  bool RotateLoopWithProfile =
2236 
2237  // First check to see if there is an obviously preferable top block for the
2238  // loop. This will default to the header, but may end up as one of the
2239  // predecessors to the header if there is one which will result in strictly
2240  // fewer branches in the loop body.
2241  // When we use profile data to rotate the loop, this is unnecessary.
2242  MachineBasicBlock *LoopTop =
2243  RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
2244 
2245  // If we selected just the header for the loop top, look for a potentially
2246  // profitable exit block in the event that rotating the loop can eliminate
2247  // branches by placing an exit edge at the bottom.
2248  //
2249  // Loops are processed innermost to uttermost, make sure we clear
2250  // PreferredLoopExit before processing a new loop.
2251  PreferredLoopExit = nullptr;
2252  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2253  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet);
2254 
2255  BlockChain &LoopChain = *BlockToChain[LoopTop];
2256 
2257  // FIXME: This is a really lame way of walking the chains in the loop: we
2258  // walk the blocks, and use a set to prevent visiting a particular chain
2259  // twice.
2260  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2261  assert(LoopChain.UnscheduledPredecessors == 0 &&
2262  "LoopChain should not have unscheduled predecessors.");
2263  UpdatedPreds.insert(&LoopChain);
2264 
2265  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2266  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2267 
2268  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2269 
2270  if (RotateLoopWithProfile)
2271  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2272  else
2273  rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
2274 
2275  LLVM_DEBUG({
2276  // Crash at the end so we get all of the debugging output first.
2277  bool BadLoop = false;
2278  if (LoopChain.UnscheduledPredecessors) {
2279  BadLoop = true;
2280  dbgs() << "Loop chain contains a block without its preds placed!\n"
2281  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2282  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2283  }
2284  for (MachineBasicBlock *ChainBB : LoopChain) {
2285  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2286  if (!LoopBlockSet.remove(ChainBB)) {
2287  // We don't mark the loop as bad here because there are real situations
2288  // where this can occur. For example, with an unanalyzable fallthrough
2289  // from a loop block to a non-loop block or vice versa.
2290  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2291  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2292  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2293  << " Bad block: " << getBlockName(ChainBB) << "\n";
2294  }
2295  }
2296 
2297  if (!LoopBlockSet.empty()) {
2298  BadLoop = true;
2299  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2300  dbgs() << "Loop contains blocks never placed into a chain!\n"
2301  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2302  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2303  << " Bad block: " << getBlockName(LoopBB) << "\n";
2304  }
2305  assert(!BadLoop && "Detected problems with the placement of this loop.");
2306  });
2307 
2308  BlockWorkList.clear();
2309  EHPadWorkList.clear();
2310 }
2311 
2312 void MachineBlockPlacement::buildCFGChains() {
2313  // Ensure that every BB in the function has an associated chain to simplify
2314  // the assumptions of the remaining algorithm.
2315  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2316  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2317  ++FI) {
2318  MachineBasicBlock *BB = &*FI;
2319  BlockChain *Chain =
2320  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2321  // Also, merge any blocks which we cannot reason about and must preserve
2322  // the exact fallthrough behavior for.
2323  while (true) {
2324  Cond.clear();
2325  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2326  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2327  break;
2328 
2329  MachineFunction::iterator NextFI = std::next(FI);
2330  MachineBasicBlock *NextBB = &*NextFI;
2331  // Ensure that the layout successor is a viable block, as we know that
2332  // fallthrough is a possibility.
2333  assert(NextFI != FE && "Can't fallthrough past the last block.");
2334  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2335  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2336  << "\n");
2337  Chain->merge(NextBB, nullptr);
2338 #ifndef NDEBUG
2339  BlocksWithUnanalyzableExits.insert(&*BB);
2340 #endif
2341  FI = NextFI;
2342  BB = NextBB;
2343  }
2344  }
2345 
2346  // Build any loop-based chains.
2347  PreferredLoopExit = nullptr;
2348  for (MachineLoop *L : *MLI)
2349  buildLoopChains(*L);
2350 
2351  assert(BlockWorkList.empty() &&
2352  "BlockWorkList should be empty before building final chain.");
2353  assert(EHPadWorkList.empty() &&
2354  "EHPadWorkList should be empty before building final chain.");
2355 
2356  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2357  for (MachineBasicBlock &MBB : *F)
2358  fillWorkLists(&MBB, UpdatedPreds);
2359 
2360  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2361  buildChain(&F->front(), FunctionChain);
2362 
2363 #ifndef NDEBUG
2364  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2365 #endif
2366  LLVM_DEBUG({
2367  // Crash at the end so we get all of the debugging output first.
2368  bool BadFunc = false;
2369  FunctionBlockSetType FunctionBlockSet;
2370  for (MachineBasicBlock &MBB : *F)
2371  FunctionBlockSet.insert(&MBB);
2372 
2373  for (MachineBasicBlock *ChainBB : FunctionChain)
2374  if (!FunctionBlockSet.erase(ChainBB)) {
2375  BadFunc = true;
2376  dbgs() << "Function chain contains a block not in the function!\n"
2377  << " Bad block: " << getBlockName(ChainBB) << "\n";
2378  }
2379 
2380  if (!FunctionBlockSet.empty()) {
2381  BadFunc = true;
2382  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2383  dbgs() << "Function contains blocks never placed into a chain!\n"
2384  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2385  }
2386  assert(!BadFunc && "Detected problems with the block placement.");
2387  });
2388 
2389  // Splice the blocks into place.
2390  MachineFunction::iterator InsertPos = F->begin();
2391  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2392  for (MachineBasicBlock *ChainBB : FunctionChain) {
2393  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2394  : " ... ")
2395  << getBlockName(ChainBB) << "\n");
2396  if (InsertPos != MachineFunction::iterator(ChainBB))
2397  F->splice(InsertPos, ChainBB);
2398  else
2399  ++InsertPos;
2400 
2401  // Update the terminator of the previous block.
2402  if (ChainBB == *FunctionChain.begin())
2403  continue;
2404  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2405 
2406  // FIXME: It would be awesome of updateTerminator would just return rather
2407  // than assert when the branch cannot be analyzed in order to remove this
2408  // boiler plate.
2409  Cond.clear();
2410  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2411 
2412 #ifndef NDEBUG
2413  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2414  // Given the exact block placement we chose, we may actually not _need_ to
2415  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2416  // do that at this point is a bug.
2417  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2418  !PrevBB->canFallThrough()) &&
2419  "Unexpected block with un-analyzable fallthrough!");
2420  Cond.clear();
2421  TBB = FBB = nullptr;
2422  }
2423 #endif
2424 
2425  // The "PrevBB" is not yet updated to reflect current code layout, so,
2426  // o. it may fall-through to a block without explicit "goto" instruction
2427  // before layout, and no longer fall-through it after layout; or
2428  // o. just opposite.
2429  //
2430  // analyzeBranch() may return erroneous value for FBB when these two
2431  // situations take place. For the first scenario FBB is mistakenly set NULL;
2432  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2433  // mistakenly pointing to "*BI".
2434  // Thus, if the future change needs to use FBB before the layout is set, it
2435  // has to correct FBB first by using the code similar to the following:
2436  //
2437  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2438  // PrevBB->updateTerminator();
2439  // Cond.clear();
2440  // TBB = FBB = nullptr;
2441  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2442  // // FIXME: This should never take place.
2443  // TBB = FBB = nullptr;
2444  // }
2445  // }
2446  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2447  PrevBB->updateTerminator();
2448  }
2449 
2450  // Fixup the last block.
2451  Cond.clear();
2452  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2453  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2454  F->back().updateTerminator();
2455 
2456  BlockWorkList.clear();
2457  EHPadWorkList.clear();
2458 }
2459 
2460 void MachineBlockPlacement::optimizeBranches() {
2461  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2462  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2463 
2464  // Now that all the basic blocks in the chain have the proper layout,
2465  // make a final call to AnalyzeBranch with AllowModify set.
2466  // Indeed, the target may be able to optimize the branches in a way we
2467  // cannot because all branches may not be analyzable.
2468  // E.g., the target may be able to remove an unconditional branch to
2469  // a fallthrough when it occurs after predicated terminators.
2470  for (MachineBasicBlock *ChainBB : FunctionChain) {
2471  Cond.clear();
2472  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2473  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2474  // If PrevBB has a two-way branch, try to re-order the branches
2475  // such that we branch to the successor with higher probability first.
2476  if (TBB && !Cond.empty() && FBB &&
2477  MBPI->getEdgeProbability(ChainBB, FBB) >
2478  MBPI->getEdgeProbability(ChainBB, TBB) &&
2479  !TII->reverseBranchCondition(Cond)) {
2480  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2481  << getBlockName(ChainBB) << "\n");
2482  LLVM_DEBUG(dbgs() << " Edge probability: "
2483  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2484  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2485  DebugLoc dl; // FIXME: this is nowhere
2486  TII->removeBranch(*ChainBB);
2487  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2488  ChainBB->updateTerminator();
2489  }
2490  }
2491  }
2492 }
2493 
2494 void MachineBlockPlacement::alignBlocks() {
2495  // Walk through the backedges of the function now that we have fully laid out
2496  // the basic blocks and align the destination of each backedge. We don't rely
2497  // exclusively on the loop info here so that we can align backedges in
2498  // unnatural CFGs and backedges that were introduced purely because of the
2499  // loop rotations done during this layout pass.
2500  if (F->getFunction().optForSize())
2501  return;
2502  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2503  if (FunctionChain.begin() == FunctionChain.end())
2504  return; // Empty chain.
2505 
2506  const BranchProbability ColdProb(1, 5); // 20%
2507  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2508  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2509  for (MachineBasicBlock *ChainBB : FunctionChain) {
2510  if (ChainBB == *FunctionChain.begin())
2511  continue;
2512 
2513  // Don't align non-looping basic blocks. These are unlikely to execute
2514  // enough times to matter in practice. Note that we'll still handle
2515  // unnatural CFGs inside of a natural outer loop (the common case) and
2516  // rotated loops.
2517  MachineLoop *L = MLI->getLoopFor(ChainBB);
2518  if (!L)
2519  continue;
2520 
2521  unsigned Align = TLI->getPrefLoopAlignment(L);
2522  if (!Align)
2523  continue; // Don't care about loop alignment.
2524 
2525  // If the block is cold relative to the function entry don't waste space
2526  // aligning it.
2527  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2528  if (Freq < WeightedEntryFreq)
2529  continue;
2530 
2531  // If the block is cold relative to its loop header, don't align it
2532  // regardless of what edges into the block exist.
2533  MachineBasicBlock *LoopHeader = L->getHeader();
2534  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2535  if (Freq < (LoopHeaderFreq * ColdProb))
2536  continue;
2537 
2538  // Check for the existence of a non-layout predecessor which would benefit
2539  // from aligning this block.
2540  MachineBasicBlock *LayoutPred =
2541  &*std::prev(MachineFunction::iterator(ChainBB));
2542 
2543  // Force alignment if all the predecessors are jumps. We already checked
2544  // that the block isn't cold above.
2545  if (!LayoutPred->isSuccessor(ChainBB)) {
2546  ChainBB->setAlignment(Align);
2547  continue;
2548  }
2549 
2550  // Align this block if the layout predecessor's edge into this block is
2551  // cold relative to the block. When this is true, other predecessors make up
2552  // all of the hot entries into the block and thus alignment is likely to be
2553  // important.
2554  BranchProbability LayoutProb =
2555  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2556  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2557  if (LayoutEdgeFreq <= (Freq * ColdProb))
2558  ChainBB->setAlignment(Align);
2559  }
2560 }
2561 
2562 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2563 /// it was duplicated into its chain predecessor and removed.
2564 /// \p BB - Basic block that may be duplicated.
2565 ///
2566 /// \p LPred - Chosen layout predecessor of \p BB.
2567 /// Updated to be the chain end if LPred is removed.
2568 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2569 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2570 /// Used to identify which blocks to update predecessor
2571 /// counts.
2572 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2573 /// chosen in the given order due to unnatural CFG
2574 /// only needed if \p BB is removed and
2575 /// \p PrevUnplacedBlockIt pointed to \p BB.
2576 /// @return true if \p BB was removed.
2577 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2578  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2579  const MachineBasicBlock *LoopHeaderBB,
2580  BlockChain &Chain, BlockFilterSet *BlockFilter,
2581  MachineFunction::iterator &PrevUnplacedBlockIt) {
2582  bool Removed, DuplicatedToLPred;
2583  bool DuplicatedToOriginalLPred;
2584  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2585  PrevUnplacedBlockIt,
2586  DuplicatedToLPred);
2587  if (!Removed)
2588  return false;
2589  DuplicatedToOriginalLPred = DuplicatedToLPred;
2590  // Iteratively try to duplicate again. It can happen that a block that is
2591  // duplicated into is still small enough to be duplicated again.
2592  // No need to call markBlockSuccessors in this case, as the blocks being
2593  // duplicated from here on are already scheduled.
2594  // Note that DuplicatedToLPred always implies Removed.
2595  while (DuplicatedToLPred) {
2596  assert(Removed && "Block must have been removed to be duplicated into its "
2597  "layout predecessor.");
2598  MachineBasicBlock *DupBB, *DupPred;
2599  // The removal callback causes Chain.end() to be updated when a block is
2600  // removed. On the first pass through the loop, the chain end should be the
2601  // same as it was on function entry. On subsequent passes, because we are
2602  // duplicating the block at the end of the chain, if it is removed the
2603  // chain will have shrunk by one block.
2604  BlockChain::iterator ChainEnd = Chain.end();
2605  DupBB = *(--ChainEnd);
2606  // Now try to duplicate again.
2607  if (ChainEnd == Chain.begin())
2608  break;
2609  DupPred = *std::prev(ChainEnd);
2610  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2611  PrevUnplacedBlockIt,
2612  DuplicatedToLPred);
2613  }
2614  // If BB was duplicated into LPred, it is now scheduled. But because it was
2615  // removed, markChainSuccessors won't be called for its chain. Instead we
2616  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2617  // at the end because repeating the tail duplication can increase the number
2618  // of unscheduled predecessors.
2619  LPred = *std::prev(Chain.end());
2620  if (DuplicatedToOriginalLPred)
2621  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2622  return true;
2623 }
2624 
2625 /// Tail duplicate \p BB into (some) predecessors if profitable.
2626 /// \p BB - Basic block that may be duplicated
2627 /// \p LPred - Chosen layout predecessor of \p BB
2628 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2629 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2630 /// Used to identify which blocks to update predecessor
2631 /// counts.
2632 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2633 /// chosen in the given order due to unnatural CFG
2634 /// only needed if \p BB is removed and
2635 /// \p PrevUnplacedBlockIt pointed to \p BB.
2636 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2637 /// only be true if the block was removed.
2638 /// \return - True if the block was duplicated into all preds and removed.
2639 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2641  BlockChain &Chain, BlockFilterSet *BlockFilter,
2642  MachineFunction::iterator &PrevUnplacedBlockIt,
2643  bool &DuplicatedToLPred) {
2644  DuplicatedToLPred = false;
2645  if (!shouldTailDuplicate(BB))
2646  return false;
2647 
2648  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2649  << "\n");
2650 
2651  // This has to be a callback because none of it can be done after
2652  // BB is deleted.
2653  bool Removed = false;
2654  auto RemovalCallback =
2655  [&](MachineBasicBlock *RemBB) {
2656  // Signal to outer function
2657  Removed = true;
2658 
2659  // Conservative default.
2660  bool InWorkList = true;
2661  // Remove from the Chain and Chain Map
2662  if (BlockToChain.count(RemBB)) {
2663  BlockChain *Chain = BlockToChain[RemBB];
2664  InWorkList = Chain->UnscheduledPredecessors == 0;
2665  Chain->remove(RemBB);
2666  BlockToChain.erase(RemBB);
2667  }
2668 
2669  // Handle the unplaced block iterator
2670  if (&(*PrevUnplacedBlockIt) == RemBB) {
2671  PrevUnplacedBlockIt++;
2672  }
2673 
2674  // Handle the Work Lists
2675  if (InWorkList) {
2676  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2677  if (RemBB->isEHPad())
2678  RemoveList = EHPadWorkList;
2679  RemoveList.erase(
2680  llvm::remove_if(RemoveList,
2681  [RemBB](MachineBasicBlock *BB) {
2682  return BB == RemBB;
2683  }),
2684  RemoveList.end());
2685  }
2686 
2687  // Handle the filter set
2688  if (BlockFilter) {
2689  BlockFilter->remove(RemBB);
2690  }
2691 
2692  // Remove the block from loop info.
2693  MLI->removeBlock(RemBB);
2694  if (RemBB == PreferredLoopExit)
2695  PreferredLoopExit = nullptr;
2696 
2697  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2698  << getBlockName(RemBB) << "\n");
2699  };
2700  auto RemovalCallbackRef =
2701  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2702 
2703  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2704  bool IsSimple = TailDup.isSimpleBB(BB);
2705  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2706  &DuplicatedPreds, &RemovalCallbackRef);
2707 
2708  // Update UnscheduledPredecessors to reflect tail-duplication.
2709  DuplicatedToLPred = false;
2710  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2711  // We're only looking for unscheduled predecessors that match the filter.
2712  BlockChain* PredChain = BlockToChain[Pred];
2713  if (Pred == LPred)
2714  DuplicatedToLPred = true;
2715  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2716  || PredChain == &Chain)
2717  continue;
2718  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2719  if (BlockFilter && !BlockFilter->count(NewSucc))
2720  continue;
2721  BlockChain *NewChain = BlockToChain[NewSucc];
2722  if (NewChain != &Chain && NewChain != PredChain)
2723  NewChain->UnscheduledPredecessors++;
2724  }
2725  }
2726  return Removed;
2727 }
2728 
2729 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2730  if (skipFunction(MF.getFunction()))
2731  return false;
2732 
2733  // Check for single-block functions and skip them.
2734  if (std::next(MF.begin()) == MF.end())
2735  return false;
2736 
2737  F = &MF;
2738  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2739  MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
2740  getAnalysis<MachineBlockFrequencyInfo>());
2741  MLI = &getAnalysis<MachineLoopInfo>();
2742  TII = MF.getSubtarget().getInstrInfo();
2743  TLI = MF.getSubtarget().getTargetLowering();
2744  MPDT = nullptr;
2745 
2746  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2747  // there are no MachineLoops.
2748  PreferredLoopExit = nullptr;
2749 
2750  assert(BlockToChain.empty() &&
2751  "BlockToChain map should be empty before starting placement.");
2752  assert(ComputedEdges.empty() &&
2753  "Computed Edge map should be empty before starting placement.");
2754 
2755  unsigned TailDupSize = TailDupPlacementThreshold;
2756  // If only the aggressive threshold is explicitly set, use it.
2757  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
2758  TailDupPlacementThreshold.getNumOccurrences() == 0)
2760 
2761  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
2762  // For aggressive optimization, we can adjust some thresholds to be less
2763  // conservative.
2764  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
2765  // At O3 we should be more willing to copy blocks for tail duplication. This
2766  // increases size pressure, so we only do it at O3
2767  // Do this unless only the regular threshold is explicitly set.
2768  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
2769  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
2771  }
2772 
2773  if (allowTailDupPlacement()) {
2774  MPDT = &getAnalysis<MachinePostDominatorTree>();
2775  if (MF.getFunction().optForSize())
2776  TailDupSize = 1;
2777  bool PreRegAlloc = false;
2778  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
2779  precomputeTriangleChains();
2780  }
2781 
2782  buildCFGChains();
2783 
2784  // Changing the layout can create new tail merging opportunities.
2785  // TailMerge can create jump into if branches that make CFG irreducible for
2786  // HW that requires structured CFG.
2787  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
2788  PassConfig->getEnableTailMerge() &&
2790  // No tail merging opportunities if the block number is less than four.
2791  if (MF.size() > 3 && EnableTailMerge) {
2792  unsigned TailMergeSize = TailDupSize + 1;
2793  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
2794  *MBPI, TailMergeSize);
2795 
2796  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
2797  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
2798  /*AfterBlockPlacement=*/true)) {
2799  // Redo the layout if tail merging creates/removes/moves blocks.
2800  BlockToChain.clear();
2801  ComputedEdges.clear();
2802  // Must redo the post-dominator tree if blocks were changed.
2803  if (MPDT)
2804  MPDT->runOnMachineFunction(MF);
2805  ChainAllocator.DestroyAll();
2806  buildCFGChains();
2807  }
2808  }
2809 
2810  optimizeBranches();
2811  alignBlocks();
2812 
2813  BlockToChain.clear();
2814  ComputedEdges.clear();
2815  ChainAllocator.DestroyAll();
2816 
2817  if (AlignAllBlock)
2818  // Align all of the blocks in the function to a specific alignment.
2819  for (MachineBasicBlock &MBB : MF)
2821  else if (AlignAllNonFallThruBlocks) {
2822  // Align all of the blocks that have no fall-through predecessors to a
2823  // specific alignment.
2824  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
2825  auto LayoutPred = std::prev(MBI);
2826  if (!LayoutPred->isSuccessor(&*MBI))
2827  MBI->setAlignment(AlignAllNonFallThruBlocks);
2828  }
2829  }
2830  if (ViewBlockLayoutWithBFI != GVDT_None &&
2831  (ViewBlockFreqFuncName.empty() ||
2832  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
2833  MBFI->view("MBP." + MF.getName(), false);
2834  }
2835 
2836 
2837  // We always return true as we have no way to track whether the final order
2838  // differs from the original order.
2839  return true;
2840 }
2841 
2842 namespace {
2843 
2844 /// A pass to compute block placement statistics.
2845 ///
2846 /// A separate pass to compute interesting statistics for evaluating block
2847 /// placement. This is separate from the actual placement pass so that they can
2848 /// be computed in the absence of any placement transformations or when using
2849 /// alternative placement strategies.
2850 class MachineBlockPlacementStats : public MachineFunctionPass {
2851  /// A handle to the branch probability pass.
2852  const MachineBranchProbabilityInfo *MBPI;
2853 
2854  /// A handle to the function-wide block frequency pass.
2855  const MachineBlockFrequencyInfo *MBFI;
2856 
2857 public:
2858  static char ID; // Pass identification, replacement for typeid
2859 
2860  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
2862  }
2863 
2864  bool runOnMachineFunction(MachineFunction &F) override;
2865 
2866  void getAnalysisUsage(AnalysisUsage &AU) const override {
2869  AU.setPreservesAll();
2871  }
2872 };
2873 
2874 } // end anonymous namespace
2875 
2877 
2879 
2880 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
2881  "Basic Block Placement Stats", false, false)
2884 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
2885  "Basic Block Placement Stats", false, false)
2886 
2887 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
2888  // Check for single-block functions and skip them.
2889  if (std::next(F.begin()) == F.end())
2890  return false;
2891 
2892  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2893  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
2894 
2895  for (MachineBasicBlock &MBB : F) {
2896  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
2897  Statistic &NumBranches =
2898  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
2899  Statistic &BranchTakenFreq =
2900  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
2901  for (MachineBasicBlock *Succ : MBB.successors()) {
2902  // Skip if this successor is a fallthrough.
2903  if (MBB.isLayoutSuccessor(Succ))
2904  continue;
2905 
2906  BlockFrequency EdgeFreq =
2907  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
2908  ++NumBranches;
2909  BranchTakenFreq += EdgeFreq.getFrequency();
2910  }
2911  }
2912 
2913  return false;
2914 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:259
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:250
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:328
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
unsigned succ_size(const BasicBlock *BB)
Definition: CFG.h:146
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:107
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:34
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:452
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void setAlignment(unsigned Align)
Set alignment of the basic block.
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:191
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:480
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:100
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:251
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:408
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1084
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:410
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:291
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:129
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
virtual unsigned getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:584
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:436
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1058
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:445
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:53
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1039
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:93
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:110
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:298
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:387
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:924
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:327
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:169
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:163
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " "blocks in the function."), cl::init(0), cl::Hidden)
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:133
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:652
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:149
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:224
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:155
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:141
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:181
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:477
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:119
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:154
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all " "blocks that have no fall-through predecessors (i.e. don't add " "nops that are executed)."), cl::init(0), cl::Hidden)
bool hasProfileData() const
Return true if the function is annotated with profile data.
Definition: Function.h:294
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.