LLVM  6.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements basic block placement transformations using the CFG
11 // structure and branch probability estimates.
12 //
13 // The pass strives to preserve the structure of the CFG (that is, retain
14 // a topological ordering of basic blocks) in the absence of a *strong* signal
15 // to the contrary from probabilities. However, within the CFG structure, it
16 // attempts to choose an ordering which favors placing more likely sequences of
17 // blocks adjacent to each other.
18 //
19 // The algorithm works from the inner-most loop within a function outward, and
20 // at each stage walks through the basic blocks, trying to coalesce them into
21 // sequential chains where allowed by the CFG (or demanded by heavy
22 // probabilities). Finally, it walks the blocks in topological order, and the
23 // first time it reaches a chain of basic blocks, it schedules them in the
24 // function in-order.
25 //
26 //===----------------------------------------------------------------------===//
27 
28 #include "BranchFolding.h"
29 #include "llvm/ADT/ArrayRef.h"
30 #include "llvm/ADT/DenseMap.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/ADT/SetVector.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/Pass.h"
53 #include "llvm/Support/Allocator.h"
56 #include "llvm/Support/CodeGen.h"
58 #include "llvm/Support/Compiler.h"
59 #include "llvm/Support/Debug.h"
62 #include <algorithm>
63 #include <cassert>
64 #include <cstdint>
65 #include <iterator>
66 #include <memory>
67 #include <string>
68 #include <tuple>
69 #include <utility>
70 #include <vector>
71 
72 using namespace llvm;
73 
74 #define DEBUG_TYPE "block-placement"
75 
76 STATISTIC(NumCondBranches, "Number of conditional branches");
77 STATISTIC(NumUncondBranches, "Number of unconditional branches");
78 STATISTIC(CondBranchTakenFreq,
79  "Potential frequency of taking conditional branches");
80 STATISTIC(UncondBranchTakenFreq,
81  "Potential frequency of taking unconditional branches");
82 
83 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
84  cl::desc("Force the alignment of all "
85  "blocks in the function."),
86  cl::init(0), cl::Hidden);
87 
89  "align-all-nofallthru-blocks",
90  cl::desc("Force the alignment of all "
91  "blocks that have no fall-through predecessors (i.e. don't add "
92  "nops that are executed)."),
93  cl::init(0), cl::Hidden);
94 
95 // FIXME: Find a good default for this flag and remove the flag.
97  "block-placement-exit-block-bias",
98  cl::desc("Block frequency percentage a loop exit block needs "
99  "over the original exit to be considered the new exit."),
100  cl::init(0), cl::Hidden);
101 
102 // Definition:
103 // - Outlining: placement of a basic block outside the chain or hot path.
104 
106  "loop-to-cold-block-ratio",
107  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
108  "(frequency of block) is greater than this ratio"),
109  cl::init(5), cl::Hidden);
110 
112  "force-loop-cold-block",
113  cl::desc("Force outlining cold blocks from loops."),
114  cl::init(false), cl::Hidden);
115 
116 static cl::opt<bool>
117  PreciseRotationCost("precise-rotation-cost",
118  cl::desc("Model the cost of loop rotation more "
119  "precisely by using profile data."),
120  cl::init(false), cl::Hidden);
121 
122 static cl::opt<bool>
123  ForcePreciseRotationCost("force-precise-rotation-cost",
124  cl::desc("Force the use of precise cost "
125  "loop rotation strategy."),
126  cl::init(false), cl::Hidden);
127 
129  "misfetch-cost",
130  cl::desc("Cost that models the probabilistic risk of an instruction "
131  "misfetch due to a jump comparing to falling through, whose cost "
132  "is zero."),
133  cl::init(1), cl::Hidden);
134 
135 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
136  cl::desc("Cost of jump instructions."),
137  cl::init(1), cl::Hidden);
138 static cl::opt<bool>
139 TailDupPlacement("tail-dup-placement",
140  cl::desc("Perform tail duplication during placement. "
141  "Creates more fallthrough opportunites in "
142  "outline branches."),
143  cl::init(true), cl::Hidden);
144 
145 static cl::opt<bool>
146 BranchFoldPlacement("branch-fold-placement",
147  cl::desc("Perform branch folding during placement. "
148  "Reduces code size."),
149  cl::init(true), cl::Hidden);
150 
151 // Heuristic for tail duplication.
153  "tail-dup-placement-threshold",
154  cl::desc("Instruction cutoff for tail duplication during layout. "
155  "Tail merging during layout is forced to have a threshold "
156  "that won't conflict."), cl::init(2),
157  cl::Hidden);
158 
159 // Heuristic for aggressive tail duplication.
161  "tail-dup-placement-aggressive-threshold",
162  cl::desc("Instruction cutoff for aggressive tail duplication during "
163  "layout. Used at -O3. Tail merging during layout is forced to "
164  "have a threshold that won't conflict."), cl::init(4),
165  cl::Hidden);
166 
167 // Heuristic for tail duplication.
169  "tail-dup-placement-penalty",
170  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
171  "Copying can increase fallthrough, but it also increases icache "
172  "pressure. This parameter controls the penalty to account for that. "
173  "Percent as integer."),
174  cl::init(2),
175  cl::Hidden);
176 
177 // Heuristic for triangle chains.
179  "triangle-chain-count",
180  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
181  "triangle tail duplication heuristic to kick in. 0 to disable."),
182  cl::init(2),
183  cl::Hidden);
184 
187 
188 // Internal option used to control BFI display only after MBP pass.
189 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
190 // -view-block-layout-with-bfi=
192 
193 // Command line option to specify the name of the function for CFG dump
194 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
196 
197 namespace {
198 
199 class BlockChain;
200 
201 /// \brief Type for our function-wide basic block -> block chain mapping.
202 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
203 
204 /// \brief A chain of blocks which will be laid out contiguously.
205 ///
206 /// This is the datastructure representing a chain of consecutive blocks that
207 /// are profitable to layout together in order to maximize fallthrough
208 /// probabilities and code locality. We also can use a block chain to represent
209 /// a sequence of basic blocks which have some external (correctness)
210 /// requirement for sequential layout.
211 ///
212 /// Chains can be built around a single basic block and can be merged to grow
213 /// them. They participate in a block-to-chain mapping, which is updated
214 /// automatically as chains are merged together.
215 class BlockChain {
216  /// \brief The sequence of blocks belonging to this chain.
217  ///
218  /// This is the sequence of blocks for a particular chain. These will be laid
219  /// out in-order within the function.
221 
222  /// \brief A handle to the function-wide basic block to block chain mapping.
223  ///
224  /// This is retained in each block chain to simplify the computation of child
225  /// block chains for SCC-formation and iteration. We store the edges to child
226  /// basic blocks, and map them back to their associated chains using this
227  /// structure.
228  BlockToChainMapType &BlockToChain;
229 
230 public:
231  /// \brief Construct a new BlockChain.
232  ///
233  /// This builds a new block chain representing a single basic block in the
234  /// function. It also registers itself as the chain that block participates
235  /// in with the BlockToChain mapping.
236  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
237  : Blocks(1, BB), BlockToChain(BlockToChain) {
238  assert(BB && "Cannot create a chain with a null basic block");
239  BlockToChain[BB] = this;
240  }
241 
242  /// \brief Iterator over blocks within the chain.
245 
246  /// \brief Beginning of blocks within the chain.
247  iterator begin() { return Blocks.begin(); }
248  const_iterator begin() const { return Blocks.begin(); }
249 
250  /// \brief End of blocks within the chain.
251  iterator end() { return Blocks.end(); }
252  const_iterator end() const { return Blocks.end(); }
253 
254  bool remove(MachineBasicBlock* BB) {
255  for(iterator i = begin(); i != end(); ++i) {
256  if (*i == BB) {
257  Blocks.erase(i);
258  return true;
259  }
260  }
261  return false;
262  }
263 
264  /// \brief Merge a block chain into this one.
265  ///
266  /// This routine merges a block chain into this one. It takes care of forming
267  /// a contiguous sequence of basic blocks, updating the edge list, and
268  /// updating the block -> chain mapping. It does not free or tear down the
269  /// old chain, but the old chain's block list is no longer valid.
270  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
271  assert(BB && "Can't merge a null block.");
272  assert(!Blocks.empty() && "Can't merge into an empty chain.");
273 
274  // Fast path in case we don't have a chain already.
275  if (!Chain) {
276  assert(!BlockToChain[BB] &&
277  "Passed chain is null, but BB has entry in BlockToChain.");
278  Blocks.push_back(BB);
279  BlockToChain[BB] = this;
280  return;
281  }
282 
283  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
284  assert(Chain->begin() != Chain->end());
285 
286  // Update the incoming blocks to point to this chain, and add them to the
287  // chain structure.
288  for (MachineBasicBlock *ChainBB : *Chain) {
289  Blocks.push_back(ChainBB);
290  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
291  BlockToChain[ChainBB] = this;
292  }
293  }
294 
295 #ifndef NDEBUG
296  /// \brief Dump the blocks in this chain.
297  LLVM_DUMP_METHOD void dump() {
298  for (MachineBasicBlock *MBB : *this)
299  MBB->dump();
300  }
301 #endif // NDEBUG
302 
303  /// \brief Count of predecessors of any block within the chain which have not
304  /// yet been scheduled. In general, we will delay scheduling this chain
305  /// until those predecessors are scheduled (or we find a sufficiently good
306  /// reason to override this heuristic.) Note that when forming loop chains,
307  /// blocks outside the loop are ignored and treated as if they were already
308  /// scheduled.
309  ///
310  /// Note: This field is reinitialized multiple times - once for each loop,
311  /// and then once for the function as a whole.
312  unsigned UnscheduledPredecessors = 0;
313 };
314 
315 class MachineBlockPlacement : public MachineFunctionPass {
316  /// \brief A type for a block filter set.
317  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
318 
319  /// Pair struct containing basic block and taildup profitiability
320  struct BlockAndTailDupResult {
321  MachineBasicBlock *BB;
322  bool ShouldTailDup;
323  };
324 
325  /// Triple struct containing edge weight and the edge.
326  struct WeightedEdge {
327  BlockFrequency Weight;
328  MachineBasicBlock *Src;
329  MachineBasicBlock *Dest;
330  };
331 
332  /// \brief work lists of blocks that are ready to be laid out
335 
336  /// Edges that have already been computed as optimal.
338 
339  /// \brief Machine Function
341 
342  /// \brief A handle to the branch probability pass.
343  const MachineBranchProbabilityInfo *MBPI;
344 
345  /// \brief A handle to the function-wide block frequency pass.
346  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
347 
348  /// \brief A handle to the loop info.
349  MachineLoopInfo *MLI;
350 
351  /// \brief Preferred loop exit.
352  /// Member variable for convenience. It may be removed by duplication deep
353  /// in the call stack.
354  MachineBasicBlock *PreferredLoopExit;
355 
356  /// \brief A handle to the target's instruction info.
357  const TargetInstrInfo *TII;
358 
359  /// \brief A handle to the target's lowering info.
360  const TargetLoweringBase *TLI;
361 
362  /// \brief A handle to the post dominator tree.
364 
365  /// \brief Duplicator used to duplicate tails during placement.
366  ///
367  /// Placement decisions can open up new tail duplication opportunities, but
368  /// since tail duplication affects placement decisions of later blocks, it
369  /// must be done inline.
370  TailDuplicator TailDup;
371 
372  /// \brief Allocator and owner of BlockChain structures.
373  ///
374  /// We build BlockChains lazily while processing the loop structure of
375  /// a function. To reduce malloc traffic, we allocate them using this
376  /// slab-like allocator, and destroy them after the pass completes. An
377  /// important guarantee is that this allocator produces stable pointers to
378  /// the chains.
380 
381  /// \brief Function wide BasicBlock to BlockChain mapping.
382  ///
383  /// This mapping allows efficiently moving from any given basic block to the
384  /// BlockChain it participates in, if any. We use it to, among other things,
385  /// allow implicitly defining edges between chains as the existing edges
386  /// between basic blocks.
388 
389 #ifndef NDEBUG
390  /// The set of basic blocks that have terminators that cannot be fully
391  /// analyzed. These basic blocks cannot be re-ordered safely by
392  /// MachineBlockPlacement, and we must preserve physical layout of these
393  /// blocks and their successors through the pass.
394  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
395 #endif
396 
397  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
398  /// if the count goes to 0, add them to the appropriate work list.
399  void markChainSuccessors(
400  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
401  const BlockFilterSet *BlockFilter = nullptr);
402 
403  /// Decrease the UnscheduledPredecessors count for a single block, and
404  /// if the count goes to 0, add them to the appropriate work list.
405  void markBlockSuccessors(
406  const BlockChain &Chain, const MachineBasicBlock *BB,
407  const MachineBasicBlock *LoopHeaderBB,
408  const BlockFilterSet *BlockFilter = nullptr);
409 
411  collectViableSuccessors(
412  const MachineBasicBlock *BB, const BlockChain &Chain,
413  const BlockFilterSet *BlockFilter,
415  bool shouldPredBlockBeOutlined(
416  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
417  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
418  BranchProbability SuccProb, BranchProbability HotProb);
419  bool repeatedlyTailDuplicateBlock(
421  const MachineBasicBlock *LoopHeaderBB,
422  BlockChain &Chain, BlockFilterSet *BlockFilter,
423  MachineFunction::iterator &PrevUnplacedBlockIt);
424  bool maybeTailDuplicateBlock(
426  BlockChain &Chain, BlockFilterSet *BlockFilter,
427  MachineFunction::iterator &PrevUnplacedBlockIt,
428  bool &DuplicatedToPred);
429  bool hasBetterLayoutPredecessor(
430  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
431  const BlockChain &SuccChain, BranchProbability SuccProb,
432  BranchProbability RealSuccProb, const BlockChain &Chain,
433  const BlockFilterSet *BlockFilter);
434  BlockAndTailDupResult selectBestSuccessor(
435  const MachineBasicBlock *BB, const BlockChain &Chain,
436  const BlockFilterSet *BlockFilter);
437  MachineBasicBlock *selectBestCandidateBlock(
438  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
439  MachineBasicBlock *getFirstUnplacedBlock(
440  const BlockChain &PlacedChain,
441  MachineFunction::iterator &PrevUnplacedBlockIt,
442  const BlockFilterSet *BlockFilter);
443 
444  /// \brief Add a basic block to the work list if it is appropriate.
445  ///
446  /// If the optional parameter BlockFilter is provided, only MBB
447  /// present in the set will be added to the worklist. If nullptr
448  /// is provided, no filtering occurs.
449  void fillWorkLists(const MachineBasicBlock *MBB,
450  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
451  const BlockFilterSet *BlockFilter);
452 
453  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
454  BlockFilterSet *BlockFilter = nullptr);
455  MachineBasicBlock *findBestLoopTop(
456  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
457  MachineBasicBlock *findBestLoopExit(
458  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
459  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
460  void buildLoopChains(const MachineLoop &L);
461  void rotateLoop(
462  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
463  const BlockFilterSet &LoopBlockSet);
464  void rotateLoopWithProfile(
465  BlockChain &LoopChain, const MachineLoop &L,
466  const BlockFilterSet &LoopBlockSet);
467  void buildCFGChains();
468  void optimizeBranches();
469  void alignBlocks();
470  /// Returns true if a block should be tail-duplicated to increase fallthrough
471  /// opportunities.
472  bool shouldTailDuplicate(MachineBasicBlock *BB);
473  /// Check the edge frequencies to see if tail duplication will increase
474  /// fallthroughs.
475  bool isProfitableToTailDup(
476  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
477  BranchProbability AdjustedSumProb,
478  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
479 
480  /// Check for a trellis layout.
481  bool isTrellis(const MachineBasicBlock *BB,
482  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
483  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
484 
485  /// Get the best successor given a trellis layout.
486  BlockAndTailDupResult getBestTrellisSuccessor(
487  const MachineBasicBlock *BB,
488  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
489  BranchProbability AdjustedSumProb, const BlockChain &Chain,
490  const BlockFilterSet *BlockFilter);
491 
492  /// Get the best pair of non-conflicting edges.
493  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
494  const MachineBasicBlock *BB,
496 
497  /// Returns true if a block can tail duplicate into all unplaced
498  /// predecessors. Filters based on loop.
499  bool canTailDuplicateUnplacedPreds(
500  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
501  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
502 
503  /// Find chains of triangles to tail-duplicate where a global analysis works,
504  /// but a local analysis would not find them.
505  void precomputeTriangleChains();
506 
507 public:
508  static char ID; // Pass identification, replacement for typeid
509 
510  MachineBlockPlacement() : MachineFunctionPass(ID) {
512  }
513 
514  bool runOnMachineFunction(MachineFunction &F) override;
515 
516  void getAnalysisUsage(AnalysisUsage &AU) const override {
519  if (TailDupPlacement)
524  }
525 };
526 
527 } // end anonymous namespace
528 
530 
532 
533 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
534  "Branch Probability Basic Block Placement", false, false)
539 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
540  "Branch Probability Basic Block Placement", false, false)
541 
542 #ifndef NDEBUG
543 /// \brief Helper to print the name of a MBB.
544 ///
545 /// Only used by debug logging.
546 static std::string getBlockName(const MachineBasicBlock *BB) {
547  std::string Result;
548  raw_string_ostream OS(Result);
549  OS << "BB#" << BB->getNumber();
550  OS << " ('" << BB->getName() << "')";
551  OS.flush();
552  return Result;
553 }
554 #endif
555 
556 /// \brief Mark a chain's successors as having one fewer preds.
557 ///
558 /// When a chain is being merged into the "placed" chain, this routine will
559 /// quickly walk the successors of each block in the chain and mark them as
560 /// having one fewer active predecessor. It also adds any successors of this
561 /// chain which reach the zero-predecessor state to the appropriate worklist.
562 void MachineBlockPlacement::markChainSuccessors(
563  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
564  const BlockFilterSet *BlockFilter) {
565  // Walk all the blocks in this chain, marking their successors as having
566  // a predecessor placed.
567  for (MachineBasicBlock *MBB : Chain) {
568  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
569  }
570 }
571 
572 /// \brief Mark a single block's successors as having one fewer preds.
573 ///
574 /// Under normal circumstances, this is only called by markChainSuccessors,
575 /// but if a block that was to be placed is completely tail-duplicated away,
576 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
577 /// for just that block.
578 void MachineBlockPlacement::markBlockSuccessors(
579  const BlockChain &Chain, const MachineBasicBlock *MBB,
580  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
581  // Add any successors for which this is the only un-placed in-loop
582  // predecessor to the worklist as a viable candidate for CFG-neutral
583  // placement. No subsequent placement of this block will violate the CFG
584  // shape, so we get to use heuristics to choose a favorable placement.
585  for (MachineBasicBlock *Succ : MBB->successors()) {
586  if (BlockFilter && !BlockFilter->count(Succ))
587  continue;
588  BlockChain &SuccChain = *BlockToChain[Succ];
589  // Disregard edges within a fixed chain, or edges to the loop header.
590  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
591  continue;
592 
593  // This is a cross-chain edge that is within the loop, so decrement the
594  // loop predecessor count of the destination chain.
595  if (SuccChain.UnscheduledPredecessors == 0 ||
596  --SuccChain.UnscheduledPredecessors > 0)
597  continue;
598 
599  auto *NewBB = *SuccChain.begin();
600  if (NewBB->isEHPad())
601  EHPadWorkList.push_back(NewBB);
602  else
603  BlockWorkList.push_back(NewBB);
604  }
605 }
606 
607 /// This helper function collects the set of successors of block
608 /// \p BB that are allowed to be its layout successors, and return
609 /// the total branch probability of edges from \p BB to those
610 /// blocks.
611 BranchProbability MachineBlockPlacement::collectViableSuccessors(
612  const MachineBasicBlock *BB, const BlockChain &Chain,
613  const BlockFilterSet *BlockFilter,
615  // Adjust edge probabilities by excluding edges pointing to blocks that is
616  // either not in BlockFilter or is already in the current chain. Consider the
617  // following CFG:
618  //
619  // --->A
620  // | / \
621  // | B C
622  // | \ / \
623  // ----D E
624  //
625  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
626  // A->C is chosen as a fall-through, D won't be selected as a successor of C
627  // due to CFG constraint (the probability of C->D is not greater than
628  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
629  // when calculating the probability of C->D, D will be selected and we
630  // will get A C D B as the layout of this loop.
631  auto AdjustedSumProb = BranchProbability::getOne();
632  for (MachineBasicBlock *Succ : BB->successors()) {
633  bool SkipSucc = false;
634  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
635  SkipSucc = true;
636  } else {
637  BlockChain *SuccChain = BlockToChain[Succ];
638  if (SuccChain == &Chain) {
639  SkipSucc = true;
640  } else if (Succ != *SuccChain->begin()) {
641  DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n");
642  continue;
643  }
644  }
645  if (SkipSucc)
646  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
647  else
648  Successors.push_back(Succ);
649  }
650 
651  return AdjustedSumProb;
652 }
653 
654 /// The helper function returns the branch probability that is adjusted
655 /// or normalized over the new total \p AdjustedSumProb.
656 static BranchProbability
658  BranchProbability AdjustedSumProb) {
659  BranchProbability SuccProb;
660  uint32_t SuccProbN = OrigProb.getNumerator();
661  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
662  if (SuccProbN >= SuccProbD)
663  SuccProb = BranchProbability::getOne();
664  else
665  SuccProb = BranchProbability(SuccProbN, SuccProbD);
666 
667  return SuccProb;
668 }
669 
670 /// Check if \p BB has exactly the successors in \p Successors.
671 static bool
674  if (BB.succ_size() != Successors.size())
675  return false;
676  // We don't want to count self-loops
677  if (Successors.count(&BB))
678  return false;
679  for (MachineBasicBlock *Succ : BB.successors())
680  if (!Successors.count(Succ))
681  return false;
682  return true;
683 }
684 
685 /// Check if a block should be tail duplicated to increase fallthrough
686 /// opportunities.
687 /// \p BB Block to check.
688 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
689  // Blocks with single successors don't create additional fallthrough
690  // opportunities. Don't duplicate them. TODO: When conditional exits are
691  // analyzable, allow them to be duplicated.
692  bool IsSimple = TailDup.isSimpleBB(BB);
693 
694  if (BB->succ_size() == 1)
695  return false;
696  return TailDup.shouldTailDuplicate(IsSimple, *BB);
697 }
698 
699 /// Compare 2 BlockFrequency's with a small penalty for \p A.
700 /// In order to be conservative, we apply a X% penalty to account for
701 /// increased icache pressure and static heuristics. For small frequencies
702 /// we use only the numerators to improve accuracy. For simplicity, we assume the
703 /// penalty is less than 100%
704 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
706  uint64_t EntryFreq) {
707  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
708  BlockFrequency Gain = A - B;
709  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
710 }
711 
712 /// Check the edge frequencies to see if tail duplication will increase
713 /// fallthroughs. It only makes sense to call this function when
714 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
715 /// always locally profitable if we would have picked \p Succ without
716 /// considering duplication.
717 bool MachineBlockPlacement::isProfitableToTailDup(
718  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
719  BranchProbability QProb,
720  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
721  // We need to do a probability calculation to make sure this is profitable.
722  // First: does succ have a successor that post-dominates? This affects the
723  // calculation. The 2 relevant cases are:
724  // BB BB
725  // | \Qout | \Qout
726  // P| C |P C
727  // = C' = C'
728  // | /Qin | /Qin
729  // | / | /
730  // Succ Succ
731  // / \ | \ V
732  // U/ =V |U \
733  // / \ = D
734  // D E | /
735  // | /
736  // |/
737  // PDom
738  // '=' : Branch taken for that CFG edge
739  // In the second case, Placing Succ while duplicating it into C prevents the
740  // fallthrough of Succ into either D or PDom, because they now have C as an
741  // unplaced predecessor
742 
743  // Start by figuring out which case we fall into
744  MachineBasicBlock *PDom = nullptr;
746  // Only scan the relevant successors
747  auto AdjustedSuccSumProb =
748  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
749  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
750  auto BBFreq = MBFI->getBlockFreq(BB);
751  auto SuccFreq = MBFI->getBlockFreq(Succ);
752  BlockFrequency P = BBFreq * PProb;
753  BlockFrequency Qout = BBFreq * QProb;
754  uint64_t EntryFreq = MBFI->getEntryFreq();
755  // If there are no more successors, it is profitable to copy, as it strictly
756  // increases fallthrough.
757  if (SuccSuccs.size() == 0)
758  return greaterWithBias(P, Qout, EntryFreq);
759 
760  auto BestSuccSucc = BranchProbability::getZero();
761  // Find the PDom or the best Succ if no PDom exists.
762  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
763  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
764  if (Prob > BestSuccSucc)
765  BestSuccSucc = Prob;
766  if (PDom == nullptr)
767  if (MPDT->dominates(SuccSucc, Succ)) {
768  PDom = SuccSucc;
769  break;
770  }
771  }
772  // For the comparisons, we need to know Succ's best incoming edge that isn't
773  // from BB.
774  auto SuccBestPred = BlockFrequency(0);
775  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
776  if (SuccPred == Succ || SuccPred == BB
777  || BlockToChain[SuccPred] == &Chain
778  || (BlockFilter && !BlockFilter->count(SuccPred)))
779  continue;
780  auto Freq = MBFI->getBlockFreq(SuccPred)
781  * MBPI->getEdgeProbability(SuccPred, Succ);
782  if (Freq > SuccBestPred)
783  SuccBestPred = Freq;
784  }
785  // Qin is Succ's best unplaced incoming edge that isn't BB
786  BlockFrequency Qin = SuccBestPred;
787  // If it doesn't have a post-dominating successor, here is the calculation:
788  // BB BB
789  // | \Qout | \
790  // P| C | =
791  // = C' | C
792  // | /Qin | |
793  // | / | C' (+Succ)
794  // Succ Succ /|
795  // / \ | \/ |
796  // U/ =V | == |
797  // / \ | / \|
798  // D E D E
799  // '=' : Branch taken for that CFG edge
800  // Cost in the first case is: P + V
801  // For this calculation, we always assume P > Qout. If Qout > P
802  // The result of this function will be ignored at the caller.
803  // Let F = SuccFreq - Qin
804  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
805 
806  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
807  BranchProbability UProb = BestSuccSucc;
808  BranchProbability VProb = AdjustedSuccSumProb - UProb;
809  BlockFrequency F = SuccFreq - Qin;
810  BlockFrequency V = SuccFreq * VProb;
811  BlockFrequency QinU = std::min(Qin, F) * UProb;
812  BlockFrequency BaseCost = P + V;
813  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
814  return greaterWithBias(BaseCost, DupCost, EntryFreq);
815  }
816  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
817  BranchProbability VProb = AdjustedSuccSumProb - UProb;
818  BlockFrequency U = SuccFreq * UProb;
819  BlockFrequency V = SuccFreq * VProb;
820  BlockFrequency F = SuccFreq - Qin;
821  // If there is a post-dominating successor, here is the calculation:
822  // BB BB BB BB
823  // | \Qout | \ | \Qout | \
824  // |P C | = |P C | =
825  // = C' |P C = C' |P C
826  // | /Qin | | | /Qin | |
827  // | / | C' (+Succ) | / | C' (+Succ)
828  // Succ Succ /| Succ Succ /|
829  // | \ V | \/ | | \ V | \/ |
830  // |U \ |U /\ =? |U = |U /\ |
831  // = D = = =?| | D | = =|
832  // | / |/ D | / |/ D
833  // | / | / | = | /
834  // |/ | / |/ | =
835  // Dom Dom Dom Dom
836  // '=' : Branch taken for that CFG edge
837  // The cost for taken branches in the first case is P + U
838  // Let F = SuccFreq - Qin
839  // The cost in the second case (assuming independence), given the layout:
840  // BB, Succ, (C+Succ), D, Dom or the layout:
841  // BB, Succ, D, Dom, (C+Succ)
842  // is Qout + max(F, Qin) * U + min(F, Qin)
843  // compare P + U vs Qout + P * U + Qin.
844  //
845  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
846  //
847  // For the 3rd case, the cost is P + 2 * V
848  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
849  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
850  if (UProb > AdjustedSuccSumProb / 2 &&
851  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
852  Chain, BlockFilter))
853  // Cases 3 & 4
854  return greaterWithBias(
855  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
856  EntryFreq);
857  // Cases 1 & 2
858  return greaterWithBias((P + U),
859  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
860  std::max(Qin, F) * UProb),
861  EntryFreq);
862 }
863 
864 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
865 /// successors form the lower part of a trellis. A successor set S forms the
866 /// lower part of a trellis if all of the predecessors of S are either in S or
867 /// have all of S as successors. We ignore trellises where BB doesn't have 2
868 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
869 /// are very uncommon and complex to compute optimally. Allowing edges within S
870 /// is not strictly a trellis, but the same algorithm works, so we allow it.
871 bool MachineBlockPlacement::isTrellis(
872  const MachineBasicBlock *BB,
873  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
874  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
875  // Technically BB could form a trellis with branching factor higher than 2.
876  // But that's extremely uncommon.
877  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
878  return false;
879 
881  BB->succ_end());
882  // To avoid reviewing the same predecessors twice.
884 
885  for (MachineBasicBlock *Succ : ViableSuccs) {
886  int PredCount = 0;
887  for (auto SuccPred : Succ->predecessors()) {
888  // Allow triangle successors, but don't count them.
889  if (Successors.count(SuccPred)) {
890  // Make sure that it is actually a triangle.
891  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
892  if (!Successors.count(CheckSucc))
893  return false;
894  continue;
895  }
896  const BlockChain *PredChain = BlockToChain[SuccPred];
897  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
898  PredChain == &Chain || PredChain == BlockToChain[Succ])
899  continue;
900  ++PredCount;
901  // Perform the successor check only once.
902  if (!SeenPreds.insert(SuccPred).second)
903  continue;
904  if (!hasSameSuccessors(*SuccPred, Successors))
905  return false;
906  }
907  // If one of the successors has only BB as a predecessor, it is not a
908  // trellis.
909  if (PredCount < 1)
910  return false;
911  }
912  return true;
913 }
914 
915 /// Pick the highest total weight pair of edges that can both be laid out.
916 /// The edges in \p Edges[0] are assumed to have a different destination than
917 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
918 /// the individual highest weight edges to the 2 different destinations, or in
919 /// case of a conflict, one of them should be replaced with a 2nd best edge.
920 std::pair<MachineBlockPlacement::WeightedEdge,
921  MachineBlockPlacement::WeightedEdge>
922 MachineBlockPlacement::getBestNonConflictingEdges(
923  const MachineBasicBlock *BB,
925  Edges) {
926  // Sort the edges, and then for each successor, find the best incoming
927  // predecessor. If the best incoming predecessors aren't the same,
928  // then that is clearly the best layout. If there is a conflict, one of the
929  // successors will have to fallthrough from the second best predecessor. We
930  // compare which combination is better overall.
931 
932  // Sort for highest frequency.
933  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
934 
935  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
936  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
937  auto BestA = Edges[0].begin();
938  auto BestB = Edges[1].begin();
939  // Arrange for the correct answer to be in BestA and BestB
940  // If the 2 best edges don't conflict, the answer is already there.
941  if (BestA->Src == BestB->Src) {
942  // Compare the total fallthrough of (Best + Second Best) for both pairs
943  auto SecondBestA = std::next(BestA);
944  auto SecondBestB = std::next(BestB);
945  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
946  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
947  if (BestAScore < BestBScore)
948  BestA = SecondBestA;
949  else
950  BestB = SecondBestB;
951  }
952  // Arrange for the BB edge to be in BestA if it exists.
953  if (BestB->Src == BB)
954  std::swap(BestA, BestB);
955  return std::make_pair(*BestA, *BestB);
956 }
957 
958 /// Get the best successor from \p BB based on \p BB being part of a trellis.
959 /// We only handle trellises with 2 successors, so the algorithm is
960 /// straightforward: Find the best pair of edges that don't conflict. We find
961 /// the best incoming edge for each successor in the trellis. If those conflict,
962 /// we consider which of them should be replaced with the second best.
963 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
964 /// comes from \p BB, it will be in \p BestEdges[0]
965 MachineBlockPlacement::BlockAndTailDupResult
966 MachineBlockPlacement::getBestTrellisSuccessor(
967  const MachineBasicBlock *BB,
968  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
969  BranchProbability AdjustedSumProb, const BlockChain &Chain,
970  const BlockFilterSet *BlockFilter) {
971 
972  BlockAndTailDupResult Result = {nullptr, false};
974  BB->succ_end());
975 
976  // We assume size 2 because it's common. For general n, we would have to do
977  // the Hungarian algorithm, but it's not worth the complexity because more
978  // than 2 successors is fairly uncommon, and a trellis even more so.
979  if (Successors.size() != 2 || ViableSuccs.size() != 2)
980  return Result;
981 
982  // Collect the edge frequencies of all edges that form the trellis.
984  int SuccIndex = 0;
985  for (auto Succ : ViableSuccs) {
986  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
987  // Skip any placed predecessors that are not BB
988  if (SuccPred != BB)
989  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
990  BlockToChain[SuccPred] == &Chain ||
991  BlockToChain[SuccPred] == BlockToChain[Succ])
992  continue;
993  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
994  MBPI->getEdgeProbability(SuccPred, Succ);
995  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
996  }
997  ++SuccIndex;
998  }
999 
1000  // Pick the best combination of 2 edges from all the edges in the trellis.
1001  WeightedEdge BestA, BestB;
1002  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1003 
1004  if (BestA.Src != BB) {
1005  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1006  // we shouldn't choose any successor. We've already looked and there's a
1007  // better fallthrough edge for all the successors.
1008  DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1009  return Result;
1010  }
1011 
1012  // Did we pick the triangle edge? If tail-duplication is profitable, do
1013  // that instead. Otherwise merge the triangle edge now while we know it is
1014  // optimal.
1015  if (BestA.Dest == BestB.Src) {
1016  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1017  // would be better.
1018  MachineBasicBlock *Succ1 = BestA.Dest;
1019  MachineBasicBlock *Succ2 = BestB.Dest;
1020  // Check to see if tail-duplication would be profitable.
1021  if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
1022  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1023  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1024  Chain, BlockFilter)) {
1026  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1027  dbgs() << " Selected: " << getBlockName(Succ2)
1028  << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
1029  Result.BB = Succ2;
1030  Result.ShouldTailDup = true;
1031  return Result;
1032  }
1033  }
1034  // We have already computed the optimal edge for the other side of the
1035  // trellis.
1036  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1037 
1038  auto TrellisSucc = BestA.Dest;
1040  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1041  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1042  << ", probability: " << SuccProb << " (Trellis)\n");
1043  Result.BB = TrellisSucc;
1044  return Result;
1045 }
1046 
1047 /// When the option TailDupPlacement is on, this method checks if the
1048 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1049 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1050 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1051  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1052  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1053  if (!shouldTailDuplicate(Succ))
1054  return false;
1055 
1056  // For CFG checking.
1058  BB->succ_end());
1059  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1060  // Make sure all unplaced and unfiltered predecessors can be
1061  // tail-duplicated into.
1062  // Skip any blocks that are already placed or not in this loop.
1063  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1064  || BlockToChain[Pred] == &Chain)
1065  continue;
1066  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1067  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1068  // This will result in a trellis after tail duplication, so we don't
1069  // need to copy Succ into this predecessor. In the presence
1070  // of a trellis tail duplication can continue to be profitable.
1071  // For example:
1072  // A A
1073  // |\ |\
1074  // | \ | \
1075  // | C | C+BB
1076  // | / | |
1077  // |/ | |
1078  // BB => BB |
1079  // |\ |\/|
1080  // | \ |/\|
1081  // | D | D
1082  // | / | /
1083  // |/ |/
1084  // Succ Succ
1085  //
1086  // After BB was duplicated into C, the layout looks like the one on the
1087  // right. BB and C now have the same successors. When considering
1088  // whether Succ can be duplicated into all its unplaced predecessors, we
1089  // ignore C.
1090  // We can do this because C already has a profitable fallthrough, namely
1091  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1092  // duplication and for this test.
1093  //
1094  // This allows trellises to be laid out in 2 separate chains
1095  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1096  // because it allows the creation of 2 fallthrough paths with links
1097  // between them, and we correctly identify the best layout for these
1098  // CFGs. We want to extend trellises that the user created in addition
1099  // to trellises created by tail-duplication, so we just look for the
1100  // CFG.
1101  continue;
1102  return false;
1103  }
1104  }
1105  return true;
1106 }
1107 
1108 /// Find chains of triangles where we believe it would be profitable to
1109 /// tail-duplicate them all, but a local analysis would not find them.
1110 /// There are 3 ways this can be profitable:
1111 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1112 /// longer chains)
1113 /// 2) The chains are statically correlated. Branch probabilities have a very
1114 /// U-shaped distribution.
1115 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1116 /// If the branches in a chain are likely to be from the same side of the
1117 /// distribution as their predecessor, but are independent at runtime, this
1118 /// transformation is profitable. (Because the cost of being wrong is a small
1119 /// fixed cost, unlike the standard triangle layout where the cost of being
1120 /// wrong scales with the # of triangles.)
1121 /// 3) The chains are dynamically correlated. If the probability that a previous
1122 /// branch was taken positively influences whether the next branch will be
1123 /// taken
1124 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1125 void MachineBlockPlacement::precomputeTriangleChains() {
1126  struct TriangleChain {
1127  std::vector<MachineBasicBlock *> Edges;
1128 
1129  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1130  : Edges({src, dst}) {}
1131 
1132  void append(MachineBasicBlock *dst) {
1133  assert(getKey()->isSuccessor(dst) &&
1134  "Attempting to append a block that is not a successor.");
1135  Edges.push_back(dst);
1136  }
1137 
1138  unsigned count() const { return Edges.size() - 1; }
1139 
1140  MachineBasicBlock *getKey() const {
1141  return Edges.back();
1142  }
1143  };
1144 
1145  if (TriangleChainCount == 0)
1146  return;
1147 
1148  DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1149  // Map from last block to the chain that contains it. This allows us to extend
1150  // chains as we find new triangles.
1152  for (MachineBasicBlock &BB : *F) {
1153  // If BB doesn't have 2 successors, it doesn't start a triangle.
1154  if (BB.succ_size() != 2)
1155  continue;
1156  MachineBasicBlock *PDom = nullptr;
1157  for (MachineBasicBlock *Succ : BB.successors()) {
1158  if (!MPDT->dominates(Succ, &BB))
1159  continue;
1160  PDom = Succ;
1161  break;
1162  }
1163  // If BB doesn't have a post-dominating successor, it doesn't form a
1164  // triangle.
1165  if (PDom == nullptr)
1166  continue;
1167  // If PDom has a hint that it is low probability, skip this triangle.
1168  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1169  continue;
1170  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1171  // we're looking for.
1172  if (!shouldTailDuplicate(PDom))
1173  continue;
1174  bool CanTailDuplicate = true;
1175  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1176  // isn't the kind of triangle we're looking for.
1177  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1178  if (Pred == &BB)
1179  continue;
1180  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1181  CanTailDuplicate = false;
1182  break;
1183  }
1184  }
1185  // If we can't tail-duplicate PDom to its predecessors, then skip this
1186  // triangle.
1187  if (!CanTailDuplicate)
1188  continue;
1189 
1190  // Now we have an interesting triangle. Insert it if it's not part of an
1191  // existing chain.
1192  // Note: This cannot be replaced with a call insert() or emplace() because
1193  // the find key is BB, but the insert/emplace key is PDom.
1194  auto Found = TriangleChainMap.find(&BB);
1195  // If it is, remove the chain from the map, grow it, and put it back in the
1196  // map with the end as the new key.
1197  if (Found != TriangleChainMap.end()) {
1198  TriangleChain Chain = std::move(Found->second);
1199  TriangleChainMap.erase(Found);
1200  Chain.append(PDom);
1201  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1202  } else {
1203  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1204  assert(InsertResult.second && "Block seen twice.");
1205  (void)InsertResult;
1206  }
1207  }
1208 
1209  // Iterating over a DenseMap is safe here, because the only thing in the body
1210  // of the loop is inserting into another DenseMap (ComputedEdges).
1211  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1212  for (auto &ChainPair : TriangleChainMap) {
1213  TriangleChain &Chain = ChainPair.second;
1214  // Benchmarking has shown that due to branch correlation duplicating 2 or
1215  // more triangles is profitable, despite the calculations assuming
1216  // independence.
1217  if (Chain.count() < TriangleChainCount)
1218  continue;
1219  MachineBasicBlock *dst = Chain.Edges.back();
1220  Chain.Edges.pop_back();
1221  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1222  DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
1223  getBlockName(dst) << " as pre-computed based on triangles.\n");
1224 
1225  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1226  assert(InsertResult.second && "Block seen twice.");
1227  (void)InsertResult;
1228 
1229  dst = src;
1230  }
1231  }
1232 }
1233 
1234 // When profile is not present, return the StaticLikelyProb.
1235 // When profile is available, we need to handle the triangle-shape CFG.
1237  const MachineBasicBlock *BB) {
1238  if (!BB->getParent()->getFunction()->getEntryCount())
1239  return BranchProbability(StaticLikelyProb, 100);
1240  if (BB->succ_size() == 2) {
1241  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1242  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1243  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1244  /* See case 1 below for the cost analysis. For BB->Succ to
1245  * be taken with smaller cost, the following needs to hold:
1246  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1247  * So the threshold T in the calculation below
1248  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1249  * So T / (1 - T) = 2, Yielding T = 2/3
1250  * Also adding user specified branch bias, we have
1251  * T = (2/3)*(ProfileLikelyProb/50)
1252  * = (2*ProfileLikelyProb)/150)
1253  */
1254  return BranchProbability(2 * ProfileLikelyProb, 150);
1255  }
1256  }
1257  return BranchProbability(ProfileLikelyProb, 100);
1258 }
1259 
1260 /// Checks to see if the layout candidate block \p Succ has a better layout
1261 /// predecessor than \c BB. If yes, returns true.
1262 /// \p SuccProb: The probability adjusted for only remaining blocks.
1263 /// Only used for logging
1264 /// \p RealSuccProb: The un-adjusted probability.
1265 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1266 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1267 /// considered
1268 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1269  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1270  const BlockChain &SuccChain, BranchProbability SuccProb,
1271  BranchProbability RealSuccProb, const BlockChain &Chain,
1272  const BlockFilterSet *BlockFilter) {
1273 
1274  // There isn't a better layout when there are no unscheduled predecessors.
1275  if (SuccChain.UnscheduledPredecessors == 0)
1276  return false;
1277 
1278  // There are two basic scenarios here:
1279  // -------------------------------------
1280  // Case 1: triangular shape CFG (if-then):
1281  // BB
1282  // | \
1283  // | \
1284  // | Pred
1285  // | /
1286  // Succ
1287  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1288  // set Succ as the layout successor of BB. Picking Succ as BB's
1289  // successor breaks the CFG constraints (FIXME: define these constraints).
1290  // With this layout, Pred BB
1291  // is forced to be outlined, so the overall cost will be cost of the
1292  // branch taken from BB to Pred, plus the cost of back taken branch
1293  // from Pred to Succ, as well as the additional cost associated
1294  // with the needed unconditional jump instruction from Pred To Succ.
1295 
1296  // The cost of the topological order layout is the taken branch cost
1297  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1298  // must hold:
1299  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1300  // < freq(BB->Succ) * taken_branch_cost.
1301  // Ignoring unconditional jump cost, we get
1302  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1303  // prob(BB->Succ) > 2 * prob(BB->Pred)
1304  //
1305  // When real profile data is available, we can precisely compute the
1306  // probability threshold that is needed for edge BB->Succ to be considered.
1307  // Without profile data, the heuristic requires the branch bias to be
1308  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1309  // -----------------------------------------------------------------
1310  // Case 2: diamond like CFG (if-then-else):
1311  // S
1312  // / \
1313  // | \
1314  // BB Pred
1315  // \ /
1316  // Succ
1317  // ..
1318  //
1319  // The current block is BB and edge BB->Succ is now being evaluated.
1320  // Note that edge S->BB was previously already selected because
1321  // prob(S->BB) > prob(S->Pred).
1322  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1323  // choose Pred, we will have a topological ordering as shown on the left
1324  // in the picture below. If we choose Succ, we have the solution as shown
1325  // on the right:
1326  //
1327  // topo-order:
1328  //
1329  // S----- ---S
1330  // | | | |
1331  // ---BB | | BB
1332  // | | | |
1333  // | Pred-- | Succ--
1334  // | | | |
1335  // ---Succ ---Pred--
1336  //
1337  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1338  // = freq(S->Pred) + freq(S->BB)
1339  //
1340  // If we have profile data (i.e, branch probabilities can be trusted), the
1341  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1342  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1343  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1344  // means the cost of topological order is greater.
1345  // When profile data is not available, however, we need to be more
1346  // conservative. If the branch prediction is wrong, breaking the topo-order
1347  // will actually yield a layout with large cost. For this reason, we need
1348  // strong biased branch at block S with Prob(S->BB) in order to select
1349  // BB->Succ. This is equivalent to looking the CFG backward with backward
1350  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1351  // profile data).
1352  // --------------------------------------------------------------------------
1353  // Case 3: forked diamond
1354  // S
1355  // / \
1356  // / \
1357  // BB Pred
1358  // | \ / |
1359  // | \ / |
1360  // | X |
1361  // | / \ |
1362  // | / \ |
1363  // S1 S2
1364  //
1365  // The current block is BB and edge BB->S1 is now being evaluated.
1366  // As above S->BB was already selected because
1367  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1368  //
1369  // topo-order:
1370  //
1371  // S-------| ---S
1372  // | | | |
1373  // ---BB | | BB
1374  // | | | |
1375  // | Pred----| | S1----
1376  // | | | |
1377  // --(S1 or S2) ---Pred--
1378  // |
1379  // S2
1380  //
1381  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1382  // + min(freq(Pred->S1), freq(Pred->S2))
1383  // Non-topo-order cost:
1384  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1385  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1386  // is 0. Then the non topo layout is better when
1387  // freq(S->Pred) < freq(BB->S1).
1388  // This is exactly what is checked below.
1389  // Note there are other shapes that apply (Pred may not be a single block,
1390  // but they all fit this general pattern.)
1392 
1393  // Make sure that a hot successor doesn't have a globally more
1394  // important predecessor.
1395  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1396  bool BadCFGConflict = false;
1397 
1398  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1399  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1400  (BlockFilter && !BlockFilter->count(Pred)) ||
1401  BlockToChain[Pred] == &Chain ||
1402  // This check is redundant except for look ahead. This function is
1403  // called for lookahead by isProfitableToTailDup when BB hasn't been
1404  // placed yet.
1405  (Pred == BB))
1406  continue;
1407  // Do backward checking.
1408  // For all cases above, we need a backward checking to filter out edges that
1409  // are not 'strongly' biased.
1410  // BB Pred
1411  // \ /
1412  // Succ
1413  // We select edge BB->Succ if
1414  // freq(BB->Succ) > freq(Succ) * HotProb
1415  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1416  // HotProb
1417  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1418  // Case 1 is covered too, because the first equation reduces to:
1419  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1420  BlockFrequency PredEdgeFreq =
1421  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1422  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1423  BadCFGConflict = true;
1424  break;
1425  }
1426  }
1427 
1428  if (BadCFGConflict) {
1429  DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> " << SuccProb
1430  << " (prob) (non-cold CFG conflict)\n");
1431  return true;
1432  }
1433 
1434  return false;
1435 }
1436 
1437 /// \brief Select the best successor for a block.
1438 ///
1439 /// This looks across all successors of a particular block and attempts to
1440 /// select the "best" one to be the layout successor. It only considers direct
1441 /// successors which also pass the block filter. It will attempt to avoid
1442 /// breaking CFG structure, but cave and break such structures in the case of
1443 /// very hot successor edges.
1444 ///
1445 /// \returns The best successor block found, or null if none are viable, along
1446 /// with a boolean indicating if tail duplication is necessary.
1447 MachineBlockPlacement::BlockAndTailDupResult
1448 MachineBlockPlacement::selectBestSuccessor(
1449  const MachineBasicBlock *BB, const BlockChain &Chain,
1450  const BlockFilterSet *BlockFilter) {
1451  const BranchProbability HotProb(StaticLikelyProb, 100);
1452 
1453  BlockAndTailDupResult BestSucc = { nullptr, false };
1454  auto BestProb = BranchProbability::getZero();
1455 
1457  auto AdjustedSumProb =
1458  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1459 
1460  DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
1461 
1462  // if we already precomputed the best successor for BB, return that if still
1463  // applicable.
1464  auto FoundEdge = ComputedEdges.find(BB);
1465  if (FoundEdge != ComputedEdges.end()) {
1466  MachineBasicBlock *Succ = FoundEdge->second.BB;
1467  ComputedEdges.erase(FoundEdge);
1468  BlockChain *SuccChain = BlockToChain[Succ];
1469  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1470  SuccChain != &Chain && Succ == *SuccChain->begin())
1471  return FoundEdge->second;
1472  }
1473 
1474  // if BB is part of a trellis, Use the trellis to determine the optimal
1475  // fallthrough edges
1476  if (isTrellis(BB, Successors, Chain, BlockFilter))
1477  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1478  BlockFilter);
1479 
1480  // For blocks with CFG violations, we may be able to lay them out anyway with
1481  // tail-duplication. We keep this vector so we can perform the probability
1482  // calculations the minimum number of times.
1484  DupCandidates;
1485  for (MachineBasicBlock *Succ : Successors) {
1486  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1487  BranchProbability SuccProb =
1488  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1489 
1490  BlockChain &SuccChain = *BlockToChain[Succ];
1491  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1492  // predecessor that yields lower global cost.
1493  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1494  Chain, BlockFilter)) {
1495  // If tail duplication would make Succ profitable, place it.
1496  if (TailDupPlacement && shouldTailDuplicate(Succ))
1497  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1498  continue;
1499  }
1500 
1501  DEBUG(
1502  dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
1503  << SuccProb
1504  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1505  << "\n");
1506 
1507  if (BestSucc.BB && BestProb >= SuccProb) {
1508  DEBUG(dbgs() << " Not the best candidate, continuing\n");
1509  continue;
1510  }
1511 
1512  DEBUG(dbgs() << " Setting it as best candidate\n");
1513  BestSucc.BB = Succ;
1514  BestProb = SuccProb;
1515  }
1516  // Handle the tail duplication candidates in order of decreasing probability.
1517  // Stop at the first one that is profitable. Also stop if they are less
1518  // profitable than BestSucc. Position is important because we preserve it and
1519  // prefer first best match. Here we aren't comparing in order, so we capture
1520  // the position instead.
1521  if (DupCandidates.size() != 0) {
1522  auto cmp =
1523  [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
1524  const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
1525  return std::get<0>(a) > std::get<0>(b);
1526  };
1527  std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
1528  }
1529  for(auto &Tup : DupCandidates) {
1530  BranchProbability DupProb;
1531  MachineBasicBlock *Succ;
1532  std::tie(DupProb, Succ) = Tup;
1533  if (DupProb < BestProb)
1534  break;
1535  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1536  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1537  DEBUG(
1538  dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
1539  << DupProb
1540  << " (Tail Duplicate)\n");
1541  BestSucc.BB = Succ;
1542  BestSucc.ShouldTailDup = true;
1543  break;
1544  }
1545  }
1546 
1547  if (BestSucc.BB)
1548  DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1549 
1550  return BestSucc;
1551 }
1552 
1553 /// \brief Select the best block from a worklist.
1554 ///
1555 /// This looks through the provided worklist as a list of candidate basic
1556 /// blocks and select the most profitable one to place. The definition of
1557 /// profitable only really makes sense in the context of a loop. This returns
1558 /// the most frequently visited block in the worklist, which in the case of
1559 /// a loop, is the one most desirable to be physically close to the rest of the
1560 /// loop body in order to improve i-cache behavior.
1561 ///
1562 /// \returns The best block found, or null if none are viable.
1563 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1564  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1565  // Once we need to walk the worklist looking for a candidate, cleanup the
1566  // worklist of already placed entries.
1567  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1568  // some code complexity) into the loop below.
1569  WorkList.erase(llvm::remove_if(WorkList,
1570  [&](MachineBasicBlock *BB) {
1571  return BlockToChain.lookup(BB) == &Chain;
1572  }),
1573  WorkList.end());
1574 
1575  if (WorkList.empty())
1576  return nullptr;
1577 
1578  bool IsEHPad = WorkList[0]->isEHPad();
1579 
1580  MachineBasicBlock *BestBlock = nullptr;
1581  BlockFrequency BestFreq;
1582  for (MachineBasicBlock *MBB : WorkList) {
1583  assert(MBB->isEHPad() == IsEHPad &&
1584  "EHPad mismatch between block and work list.");
1585 
1586  BlockChain &SuccChain = *BlockToChain[MBB];
1587  if (&SuccChain == &Chain)
1588  continue;
1589 
1590  assert(SuccChain.UnscheduledPredecessors == 0 &&
1591  "Found CFG-violating block");
1592 
1593  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1594  DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1595  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1596 
1597  // For ehpad, we layout the least probable first as to avoid jumping back
1598  // from least probable landingpads to more probable ones.
1599  //
1600  // FIXME: Using probability is probably (!) not the best way to achieve
1601  // this. We should probably have a more principled approach to layout
1602  // cleanup code.
1603  //
1604  // The goal is to get:
1605  //
1606  // +--------------------------+
1607  // | V
1608  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1609  //
1610  // Rather than:
1611  //
1612  // +-------------------------------------+
1613  // V |
1614  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1615  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1616  continue;
1617 
1618  BestBlock = MBB;
1619  BestFreq = CandidateFreq;
1620  }
1621 
1622  return BestBlock;
1623 }
1624 
1625 /// \brief Retrieve the first unplaced basic block.
1626 ///
1627 /// This routine is called when we are unable to use the CFG to walk through
1628 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1629 /// We walk through the function's blocks in order, starting from the
1630 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1631 /// re-scanning the entire sequence on repeated calls to this routine.
1632 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1633  const BlockChain &PlacedChain,
1634  MachineFunction::iterator &PrevUnplacedBlockIt,
1635  const BlockFilterSet *BlockFilter) {
1636  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1637  ++I) {
1638  if (BlockFilter && !BlockFilter->count(&*I))
1639  continue;
1640  if (BlockToChain[&*I] != &PlacedChain) {
1641  PrevUnplacedBlockIt = I;
1642  // Now select the head of the chain to which the unplaced block belongs
1643  // as the block to place. This will force the entire chain to be placed,
1644  // and satisfies the requirements of merging chains.
1645  return *BlockToChain[&*I]->begin();
1646  }
1647  }
1648  return nullptr;
1649 }
1650 
1651 void MachineBlockPlacement::fillWorkLists(
1652  const MachineBasicBlock *MBB,
1653  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1654  const BlockFilterSet *BlockFilter = nullptr) {
1655  BlockChain &Chain = *BlockToChain[MBB];
1656  if (!UpdatedPreds.insert(&Chain).second)
1657  return;
1658 
1659  assert(
1660  Chain.UnscheduledPredecessors == 0 &&
1661  "Attempting to place block with unscheduled predecessors in worklist.");
1662  for (MachineBasicBlock *ChainBB : Chain) {
1663  assert(BlockToChain[ChainBB] == &Chain &&
1664  "Block in chain doesn't match BlockToChain map.");
1665  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1666  if (BlockFilter && !BlockFilter->count(Pred))
1667  continue;
1668  if (BlockToChain[Pred] == &Chain)
1669  continue;
1670  ++Chain.UnscheduledPredecessors;
1671  }
1672  }
1673 
1674  if (Chain.UnscheduledPredecessors != 0)
1675  return;
1676 
1677  MachineBasicBlock *BB = *Chain.begin();
1678  if (BB->isEHPad())
1679  EHPadWorkList.push_back(BB);
1680  else
1681  BlockWorkList.push_back(BB);
1682 }
1683 
1684 void MachineBlockPlacement::buildChain(
1685  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1686  BlockFilterSet *BlockFilter) {
1687  assert(HeadBB && "BB must not be null.\n");
1688  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1689  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1690 
1691  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1692  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1693  MachineBasicBlock *BB = *std::prev(Chain.end());
1694  while (true) {
1695  assert(BB && "null block found at end of chain in loop.");
1696  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1697  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1698 
1699 
1700  // Look for the best viable successor if there is one to place immediately
1701  // after this block.
1702  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1703  MachineBasicBlock* BestSucc = Result.BB;
1704  bool ShouldTailDup = Result.ShouldTailDup;
1705  if (TailDupPlacement)
1706  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1707 
1708  // If an immediate successor isn't available, look for the best viable
1709  // block among those we've identified as not violating the loop's CFG at
1710  // this point. This won't be a fallthrough, but it will increase locality.
1711  if (!BestSucc)
1712  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1713  if (!BestSucc)
1714  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1715 
1716  if (!BestSucc) {
1717  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1718  if (!BestSucc)
1719  break;
1720 
1721  DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1722  "layout successor until the CFG reduces\n");
1723  }
1724 
1725  // Placement may have changed tail duplication opportunities.
1726  // Check for that now.
1727  if (TailDupPlacement && BestSucc && ShouldTailDup) {
1728  // If the chosen successor was duplicated into all its predecessors,
1729  // don't bother laying it out, just go round the loop again with BB as
1730  // the chain end.
1731  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1732  BlockFilter, PrevUnplacedBlockIt))
1733  continue;
1734  }
1735 
1736  // Place this block, updating the datastructures to reflect its placement.
1737  BlockChain &SuccChain = *BlockToChain[BestSucc];
1738  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1739  // we selected a successor that didn't fit naturally into the CFG.
1740  SuccChain.UnscheduledPredecessors = 0;
1741  DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1742  << getBlockName(BestSucc) << "\n");
1743  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1744  Chain.merge(BestSucc, &SuccChain);
1745  BB = *std::prev(Chain.end());
1746  }
1747 
1748  DEBUG(dbgs() << "Finished forming chain for header block "
1749  << getBlockName(*Chain.begin()) << "\n");
1750 }
1751 
1752 /// \brief Find the best loop top block for layout.
1753 ///
1754 /// Look for a block which is strictly better than the loop header for laying
1755 /// out at the top of the loop. This looks for one and only one pattern:
1756 /// a latch block with no conditional exit. This block will cause a conditional
1757 /// jump around it or will be the bottom of the loop if we lay it out in place,
1758 /// but if it it doesn't end up at the bottom of the loop for any reason,
1759 /// rotation alone won't fix it. Because such a block will always result in an
1760 /// unconditional jump (for the backedge) rotating it in front of the loop
1761 /// header is always profitable.
1763 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
1764  const BlockFilterSet &LoopBlockSet) {
1765  // Placing the latch block before the header may introduce an extra branch
1766  // that skips this block the first time the loop is executed, which we want
1767  // to avoid when optimising for size.
1768  // FIXME: in theory there is a case that does not introduce a new branch,
1769  // i.e. when the layout predecessor does not fallthrough to the loop header.
1770  // In practice this never happens though: there always seems to be a preheader
1771  // that can fallthrough and that is also placed before the header.
1772  if (F->getFunction()->optForSize())
1773  return L.getHeader();
1774 
1775  // Check that the header hasn't been fused with a preheader block due to
1776  // crazy branches. If it has, we need to start with the header at the top to
1777  // prevent pulling the preheader into the loop body.
1778  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1779  if (!LoopBlockSet.count(*HeaderChain.begin()))
1780  return L.getHeader();
1781 
1782  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
1783  << "\n");
1784 
1785  BlockFrequency BestPredFreq;
1786  MachineBasicBlock *BestPred = nullptr;
1787  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
1788  if (!LoopBlockSet.count(Pred))
1789  continue;
1790  DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
1791  << Pred->succ_size() << " successors, ";
1792  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1793  if (Pred->succ_size() > 1)
1794  continue;
1795 
1796  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
1797  if (!BestPred || PredFreq > BestPredFreq ||
1798  (!(PredFreq < BestPredFreq) &&
1799  Pred->isLayoutSuccessor(L.getHeader()))) {
1800  BestPred = Pred;
1801  BestPredFreq = PredFreq;
1802  }
1803  }
1804 
1805  // If no direct predecessor is fine, just use the loop header.
1806  if (!BestPred) {
1807  DEBUG(dbgs() << " final top unchanged\n");
1808  return L.getHeader();
1809  }
1810 
1811  // Walk backwards through any straight line of predecessors.
1812  while (BestPred->pred_size() == 1 &&
1813  (*BestPred->pred_begin())->succ_size() == 1 &&
1814  *BestPred->pred_begin() != L.getHeader())
1815  BestPred = *BestPred->pred_begin();
1816 
1817  DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
1818  return BestPred;
1819 }
1820 
1821 /// \brief Find the best loop exiting block for layout.
1822 ///
1823 /// This routine implements the logic to analyze the loop looking for the best
1824 /// block to layout at the top of the loop. Typically this is done to maximize
1825 /// fallthrough opportunities.
1827 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
1828  const BlockFilterSet &LoopBlockSet) {
1829  // We don't want to layout the loop linearly in all cases. If the loop header
1830  // is just a normal basic block in the loop, we want to look for what block
1831  // within the loop is the best one to layout at the top. However, if the loop
1832  // header has be pre-merged into a chain due to predecessors not having
1833  // analyzable branches, *and* the predecessor it is merged with is *not* part
1834  // of the loop, rotating the header into the middle of the loop will create
1835  // a non-contiguous range of blocks which is Very Bad. So start with the
1836  // header and only rotate if safe.
1837  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1838  if (!LoopBlockSet.count(*HeaderChain.begin()))
1839  return nullptr;
1840 
1841  BlockFrequency BestExitEdgeFreq;
1842  unsigned BestExitLoopDepth = 0;
1843  MachineBasicBlock *ExitingBB = nullptr;
1844  // If there are exits to outer loops, loop rotation can severely limit
1845  // fallthrough opportunities unless it selects such an exit. Keep a set of
1846  // blocks where rotating to exit with that block will reach an outer loop.
1847  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
1848 
1849  DEBUG(dbgs() << "Finding best loop exit for: " << getBlockName(L.getHeader())
1850  << "\n");
1851  for (MachineBasicBlock *MBB : L.getBlocks()) {
1852  BlockChain &Chain = *BlockToChain[MBB];
1853  // Ensure that this block is at the end of a chain; otherwise it could be
1854  // mid-way through an inner loop or a successor of an unanalyzable branch.
1855  if (MBB != *std::prev(Chain.end()))
1856  continue;
1857 
1858  // Now walk the successors. We need to establish whether this has a viable
1859  // exiting successor and whether it has a viable non-exiting successor.
1860  // We store the old exiting state and restore it if a viable looping
1861  // successor isn't found.
1862  MachineBasicBlock *OldExitingBB = ExitingBB;
1863  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
1864  bool HasLoopingSucc = false;
1865  for (MachineBasicBlock *Succ : MBB->successors()) {
1866  if (Succ->isEHPad())
1867  continue;
1868  if (Succ == MBB)
1869  continue;
1870  BlockChain &SuccChain = *BlockToChain[Succ];
1871  // Don't split chains, either this chain or the successor's chain.
1872  if (&Chain == &SuccChain) {
1873  DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1874  << getBlockName(Succ) << " (chain conflict)\n");
1875  continue;
1876  }
1877 
1878  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
1879  if (LoopBlockSet.count(Succ)) {
1880  DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
1881  << getBlockName(Succ) << " (" << SuccProb << ")\n");
1882  HasLoopingSucc = true;
1883  continue;
1884  }
1885 
1886  unsigned SuccLoopDepth = 0;
1887  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
1888  SuccLoopDepth = ExitLoop->getLoopDepth();
1889  if (ExitLoop->contains(&L))
1890  BlocksExitingToOuterLoop.insert(MBB);
1891  }
1892 
1893  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
1894  DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1895  << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
1896  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
1897  // Note that we bias this toward an existing layout successor to retain
1898  // incoming order in the absence of better information. The exit must have
1899  // a frequency higher than the current exit before we consider breaking
1900  // the layout.
1901  BranchProbability Bias(100 - ExitBlockBias, 100);
1902  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
1903  ExitEdgeFreq > BestExitEdgeFreq ||
1904  (MBB->isLayoutSuccessor(Succ) &&
1905  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
1906  BestExitEdgeFreq = ExitEdgeFreq;
1907  ExitingBB = MBB;
1908  }
1909  }
1910 
1911  if (!HasLoopingSucc) {
1912  // Restore the old exiting state, no viable looping successor was found.
1913  ExitingBB = OldExitingBB;
1914  BestExitEdgeFreq = OldBestExitEdgeFreq;
1915  }
1916  }
1917  // Without a candidate exiting block or with only a single block in the
1918  // loop, just use the loop header to layout the loop.
1919  if (!ExitingBB) {
1920  DEBUG(dbgs() << " No other candidate exit blocks, using loop header\n");
1921  return nullptr;
1922  }
1923  if (L.getNumBlocks() == 1) {
1924  DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
1925  return nullptr;
1926  }
1927 
1928  // Also, if we have exit blocks which lead to outer loops but didn't select
1929  // one of them as the exiting block we are rotating toward, disable loop
1930  // rotation altogether.
1931  if (!BlocksExitingToOuterLoop.empty() &&
1932  !BlocksExitingToOuterLoop.count(ExitingBB))
1933  return nullptr;
1934 
1935  DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n");
1936  return ExitingBB;
1937 }
1938 
1939 /// \brief Attempt to rotate an exiting block to the bottom of the loop.
1940 ///
1941 /// Once we have built a chain, try to rotate it to line up the hot exit block
1942 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
1943 /// branches. For example, if the loop has fallthrough into its header and out
1944 /// of its bottom already, don't rotate it.
1945 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
1946  const MachineBasicBlock *ExitingBB,
1947  const BlockFilterSet &LoopBlockSet) {
1948  if (!ExitingBB)
1949  return;
1950 
1951  MachineBasicBlock *Top = *LoopChain.begin();
1952  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
1953 
1954  // If ExitingBB is already the last one in a chain then nothing to do.
1955  if (Bottom == ExitingBB)
1956  return;
1957 
1958  bool ViableTopFallthrough = false;
1959  for (MachineBasicBlock *Pred : Top->predecessors()) {
1960  BlockChain *PredChain = BlockToChain[Pred];
1961  if (!LoopBlockSet.count(Pred) &&
1962  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1963  ViableTopFallthrough = true;
1964  break;
1965  }
1966  }
1967 
1968  // If the header has viable fallthrough, check whether the current loop
1969  // bottom is a viable exiting block. If so, bail out as rotating will
1970  // introduce an unnecessary branch.
1971  if (ViableTopFallthrough) {
1972  for (MachineBasicBlock *Succ : Bottom->successors()) {
1973  BlockChain *SuccChain = BlockToChain[Succ];
1974  if (!LoopBlockSet.count(Succ) &&
1975  (!SuccChain || Succ == *SuccChain->begin()))
1976  return;
1977  }
1978  }
1979 
1980  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
1981  if (ExitIt == LoopChain.end())
1982  return;
1983 
1984  // Rotating a loop exit to the bottom when there is a fallthrough to top
1985  // trades the entry fallthrough for an exit fallthrough.
1986  // If there is no bottom->top edge, but the chosen exit block does have
1987  // a fallthrough, we break that fallthrough for nothing in return.
1988 
1989  // Let's consider an example. We have a built chain of basic blocks
1990  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
1991  // By doing a rotation we get
1992  // Bk+1, ..., Bn, B1, ..., Bk
1993  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
1994  // If we had a fallthrough Bk -> Bk+1 it is broken now.
1995  // It might be compensated by fallthrough Bn -> B1.
1996  // So we have a condition to avoid creation of extra branch by loop rotation.
1997  // All below must be true to avoid loop rotation:
1998  // If there is a fallthrough to top (B1)
1999  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2000  // There is no fallthrough from bottom (Bn) to top (B1).
2001  // Please note that there is no exit fallthrough from Bn because we checked it
2002  // above.
2003  if (ViableTopFallthrough) {
2004  assert(std::next(ExitIt) != LoopChain.end() &&
2005  "Exit should not be last BB");
2006  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2007  if (ExitingBB->isSuccessor(NextBlockInChain))
2008  if (!Bottom->isSuccessor(Top))
2009  return;
2010  }
2011 
2012  DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2013  << " at bottom\n");
2014  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2015 }
2016 
2017 /// \brief Attempt to rotate a loop based on profile data to reduce branch cost.
2018 ///
2019 /// With profile data, we can determine the cost in terms of missed fall through
2020 /// opportunities when rotating a loop chain and select the best rotation.
2021 /// Basically, there are three kinds of cost to consider for each rotation:
2022 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2023 /// the loop to the loop header.
2024 /// 2. The possibly missed fall through edges (if they exist) from the loop
2025 /// exits to BB out of the loop.
2026 /// 3. The missed fall through edge (if it exists) from the last BB to the
2027 /// first BB in the loop chain.
2028 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2029 /// We select the best rotation with the smallest cost.
2030 void MachineBlockPlacement::rotateLoopWithProfile(
2031  BlockChain &LoopChain, const MachineLoop &L,
2032  const BlockFilterSet &LoopBlockSet) {
2033  auto HeaderBB = L.getHeader();
2034  auto HeaderIter = llvm::find(LoopChain, HeaderBB);
2035  auto RotationPos = LoopChain.end();
2036 
2037  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2038 
2039  // A utility lambda that scales up a block frequency by dividing it by a
2040  // branch probability which is the reciprocal of the scale.
2041  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2042  unsigned Scale) -> BlockFrequency {
2043  if (Scale == 0)
2044  return 0;
2045  // Use operator / between BlockFrequency and BranchProbability to implement
2046  // saturating multiplication.
2047  return Freq / BranchProbability(1, Scale);
2048  };
2049 
2050  // Compute the cost of the missed fall-through edge to the loop header if the
2051  // chain head is not the loop header. As we only consider natural loops with
2052  // single header, this computation can be done only once.
2053  BlockFrequency HeaderFallThroughCost(0);
2054  for (auto *Pred : HeaderBB->predecessors()) {
2055  BlockChain *PredChain = BlockToChain[Pred];
2056  if (!LoopBlockSet.count(Pred) &&
2057  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2058  auto EdgeFreq =
2059  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
2060  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2061  // If the predecessor has only an unconditional jump to the header, we
2062  // need to consider the cost of this jump.
2063  if (Pred->succ_size() == 1)
2064  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2065  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2066  }
2067  }
2068 
2069  // Here we collect all exit blocks in the loop, and for each exit we find out
2070  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2071  // as the sum of frequencies of exit edges we collect here, excluding the exit
2072  // edge from the tail of the loop chain.
2074  for (auto BB : LoopChain) {
2075  auto LargestExitEdgeProb = BranchProbability::getZero();
2076  for (auto *Succ : BB->successors()) {
2077  BlockChain *SuccChain = BlockToChain[Succ];
2078  if (!LoopBlockSet.count(Succ) &&
2079  (!SuccChain || Succ == *SuccChain->begin())) {
2080  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2081  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2082  }
2083  }
2084  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2085  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2086  ExitsWithFreq.emplace_back(BB, ExitFreq);
2087  }
2088  }
2089 
2090  // In this loop we iterate every block in the loop chain and calculate the
2091  // cost assuming the block is the head of the loop chain. When the loop ends,
2092  // we should have found the best candidate as the loop chain's head.
2093  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2094  EndIter = LoopChain.end();
2095  Iter != EndIter; Iter++, TailIter++) {
2096  // TailIter is used to track the tail of the loop chain if the block we are
2097  // checking (pointed by Iter) is the head of the chain.
2098  if (TailIter == LoopChain.end())
2099  TailIter = LoopChain.begin();
2100 
2101  auto TailBB = *TailIter;
2102 
2103  // Calculate the cost by putting this BB to the top.
2104  BlockFrequency Cost = 0;
2105 
2106  // If the current BB is the loop header, we need to take into account the
2107  // cost of the missed fall through edge from outside of the loop to the
2108  // header.
2109  if (Iter != HeaderIter)
2110  Cost += HeaderFallThroughCost;
2111 
2112  // Collect the loop exit cost by summing up frequencies of all exit edges
2113  // except the one from the chain tail.
2114  for (auto &ExitWithFreq : ExitsWithFreq)
2115  if (TailBB != ExitWithFreq.first)
2116  Cost += ExitWithFreq.second;
2117 
2118  // The cost of breaking the once fall-through edge from the tail to the top
2119  // of the loop chain. Here we need to consider three cases:
2120  // 1. If the tail node has only one successor, then we will get an
2121  // additional jmp instruction. So the cost here is (MisfetchCost +
2122  // JumpInstCost) * tail node frequency.
2123  // 2. If the tail node has two successors, then we may still get an
2124  // additional jmp instruction if the layout successor after the loop
2125  // chain is not its CFG successor. Note that the more frequently executed
2126  // jmp instruction will be put ahead of the other one. Assume the
2127  // frequency of those two branches are x and y, where x is the frequency
2128  // of the edge to the chain head, then the cost will be
2129  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2130  // 3. If the tail node has more than two successors (this rarely happens),
2131  // we won't consider any additional cost.
2132  if (TailBB->isSuccessor(*Iter)) {
2133  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2134  if (TailBB->succ_size() == 1)
2135  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2137  else if (TailBB->succ_size() == 2) {
2138  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2139  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2140  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2141  ? TailBBFreq * TailToHeadProb.getCompl()
2142  : TailToHeadFreq;
2143  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2144  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2145  }
2146  }
2147 
2148  DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockName(*Iter)
2149  << " to the top: " << Cost.getFrequency() << "\n");
2150 
2151  if (Cost < SmallestRotationCost) {
2152  SmallestRotationCost = Cost;
2153  RotationPos = Iter;
2154  }
2155  }
2156 
2157  if (RotationPos != LoopChain.end()) {
2158  DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2159  << " to the top\n");
2160  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2161  }
2162 }
2163 
2164 /// \brief Collect blocks in the given loop that are to be placed.
2165 ///
2166 /// When profile data is available, exclude cold blocks from the returned set;
2167 /// otherwise, collect all blocks in the loop.
2169 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2170  BlockFilterSet LoopBlockSet;
2171 
2172  // Filter cold blocks off from LoopBlockSet when profile data is available.
2173  // Collect the sum of frequencies of incoming edges to the loop header from
2174  // outside. If we treat the loop as a super block, this is the frequency of
2175  // the loop. Then for each block in the loop, we calculate the ratio between
2176  // its frequency and the frequency of the loop block. When it is too small,
2177  // don't add it to the loop chain. If there are outer loops, then this block
2178  // will be merged into the first outer loop chain for which this block is not
2179  // cold anymore. This needs precise profile data and we only do this when
2180  // profile data is available.
2181  if (F->getFunction()->getEntryCount() || ForceLoopColdBlock) {
2182  BlockFrequency LoopFreq(0);
2183  for (auto LoopPred : L.getHeader()->predecessors())
2184  if (!L.contains(LoopPred))
2185  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2186  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2187 
2188  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2189  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2190  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2191  continue;
2192  LoopBlockSet.insert(LoopBB);
2193  }
2194  } else
2195  LoopBlockSet.insert(L.block_begin(), L.block_end());
2196 
2197  return LoopBlockSet;
2198 }
2199 
2200 /// \brief Forms basic block chains from the natural loop structures.
2201 ///
2202 /// These chains are designed to preserve the existing *structure* of the code
2203 /// as much as possible. We can then stitch the chains together in a way which
2204 /// both preserves the topological structure and minimizes taken conditional
2205 /// branches.
2206 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2207  // First recurse through any nested loops, building chains for those inner
2208  // loops.
2209  for (const MachineLoop *InnerLoop : L)
2210  buildLoopChains(*InnerLoop);
2211 
2212  assert(BlockWorkList.empty() &&
2213  "BlockWorkList not empty when starting to build loop chains.");
2214  assert(EHPadWorkList.empty() &&
2215  "EHPadWorkList not empty when starting to build loop chains.");
2216  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2217 
2218  // Check if we have profile data for this function. If yes, we will rotate
2219  // this loop by modeling costs more precisely which requires the profile data
2220  // for better layout.
2221  bool RotateLoopWithProfile =
2224 
2225  // First check to see if there is an obviously preferable top block for the
2226  // loop. This will default to the header, but may end up as one of the
2227  // predecessors to the header if there is one which will result in strictly
2228  // fewer branches in the loop body.
2229  // When we use profile data to rotate the loop, this is unnecessary.
2230  MachineBasicBlock *LoopTop =
2231  RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
2232 
2233  // If we selected just the header for the loop top, look for a potentially
2234  // profitable exit block in the event that rotating the loop can eliminate
2235  // branches by placing an exit edge at the bottom.
2236  //
2237  // Loops are processed innermost to uttermost, make sure we clear
2238  // PreferredLoopExit before processing a new loop.
2239  PreferredLoopExit = nullptr;
2240  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2241  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet);
2242 
2243  BlockChain &LoopChain = *BlockToChain[LoopTop];
2244 
2245  // FIXME: This is a really lame way of walking the chains in the loop: we
2246  // walk the blocks, and use a set to prevent visiting a particular chain
2247  // twice.
2248  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2249  assert(LoopChain.UnscheduledPredecessors == 0 &&
2250  "LoopChain should not have unscheduled predecessors.");
2251  UpdatedPreds.insert(&LoopChain);
2252 
2253  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2254  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2255 
2256  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2257 
2258  if (RotateLoopWithProfile)
2259  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2260  else
2261  rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
2262 
2263  DEBUG({
2264  // Crash at the end so we get all of the debugging output first.
2265  bool BadLoop = false;
2266  if (LoopChain.UnscheduledPredecessors) {
2267  BadLoop = true;
2268  dbgs() << "Loop chain contains a block without its preds placed!\n"
2269  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2270  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2271  }
2272  for (MachineBasicBlock *ChainBB : LoopChain) {
2273  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2274  if (!LoopBlockSet.remove(ChainBB)) {
2275  // We don't mark the loop as bad here because there are real situations
2276  // where this can occur. For example, with an unanalyzable fallthrough
2277  // from a loop block to a non-loop block or vice versa.
2278  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2279  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2280  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2281  << " Bad block: " << getBlockName(ChainBB) << "\n";
2282  }
2283  }
2284 
2285  if (!LoopBlockSet.empty()) {
2286  BadLoop = true;
2287  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2288  dbgs() << "Loop contains blocks never placed into a chain!\n"
2289  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2290  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2291  << " Bad block: " << getBlockName(LoopBB) << "\n";
2292  }
2293  assert(!BadLoop && "Detected problems with the placement of this loop.");
2294  });
2295 
2296  BlockWorkList.clear();
2297  EHPadWorkList.clear();
2298 }
2299 
2300 void MachineBlockPlacement::buildCFGChains() {
2301  // Ensure that every BB in the function has an associated chain to simplify
2302  // the assumptions of the remaining algorithm.
2303  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2304  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2305  ++FI) {
2306  MachineBasicBlock *BB = &*FI;
2307  BlockChain *Chain =
2308  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2309  // Also, merge any blocks which we cannot reason about and must preserve
2310  // the exact fallthrough behavior for.
2311  while (true) {
2312  Cond.clear();
2313  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2314  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2315  break;
2316 
2317  MachineFunction::iterator NextFI = std::next(FI);
2318  MachineBasicBlock *NextBB = &*NextFI;
2319  // Ensure that the layout successor is a viable block, as we know that
2320  // fallthrough is a possibility.
2321  assert(NextFI != FE && "Can't fallthrough past the last block.");
2322  DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2323  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2324  << "\n");
2325  Chain->merge(NextBB, nullptr);
2326 #ifndef NDEBUG
2327  BlocksWithUnanalyzableExits.insert(&*BB);
2328 #endif
2329  FI = NextFI;
2330  BB = NextBB;
2331  }
2332  }
2333 
2334  // Build any loop-based chains.
2335  PreferredLoopExit = nullptr;
2336  for (MachineLoop *L : *MLI)
2337  buildLoopChains(*L);
2338 
2339  assert(BlockWorkList.empty() &&
2340  "BlockWorkList should be empty before building final chain.");
2341  assert(EHPadWorkList.empty() &&
2342  "EHPadWorkList should be empty before building final chain.");
2343 
2344  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2345  for (MachineBasicBlock &MBB : *F)
2346  fillWorkLists(&MBB, UpdatedPreds);
2347 
2348  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2349  buildChain(&F->front(), FunctionChain);
2350 
2351 #ifndef NDEBUG
2352  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2353 #endif
2354  DEBUG({
2355  // Crash at the end so we get all of the debugging output first.
2356  bool BadFunc = false;
2357  FunctionBlockSetType FunctionBlockSet;
2358  for (MachineBasicBlock &MBB : *F)
2359  FunctionBlockSet.insert(&MBB);
2360 
2361  for (MachineBasicBlock *ChainBB : FunctionChain)
2362  if (!FunctionBlockSet.erase(ChainBB)) {
2363  BadFunc = true;
2364  dbgs() << "Function chain contains a block not in the function!\n"
2365  << " Bad block: " << getBlockName(ChainBB) << "\n";
2366  }
2367 
2368  if (!FunctionBlockSet.empty()) {
2369  BadFunc = true;
2370  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2371  dbgs() << "Function contains blocks never placed into a chain!\n"
2372  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2373  }
2374  assert(!BadFunc && "Detected problems with the block placement.");
2375  });
2376 
2377  // Splice the blocks into place.
2378  MachineFunction::iterator InsertPos = F->begin();
2379  DEBUG(dbgs() << "[MBP] Function: "<< F->getName() << "\n");
2380  for (MachineBasicBlock *ChainBB : FunctionChain) {
2381  DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2382  : " ... ")
2383  << getBlockName(ChainBB) << "\n");
2384  if (InsertPos != MachineFunction::iterator(ChainBB))
2385  F->splice(InsertPos, ChainBB);
2386  else
2387  ++InsertPos;
2388 
2389  // Update the terminator of the previous block.
2390  if (ChainBB == *FunctionChain.begin())
2391  continue;
2392  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2393 
2394  // FIXME: It would be awesome of updateTerminator would just return rather
2395  // than assert when the branch cannot be analyzed in order to remove this
2396  // boiler plate.
2397  Cond.clear();
2398  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2399 
2400 #ifndef NDEBUG
2401  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2402  // Given the exact block placement we chose, we may actually not _need_ to
2403  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2404  // do that at this point is a bug.
2405  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2406  !PrevBB->canFallThrough()) &&
2407  "Unexpected block with un-analyzable fallthrough!");
2408  Cond.clear();
2409  TBB = FBB = nullptr;
2410  }
2411 #endif
2412 
2413  // The "PrevBB" is not yet updated to reflect current code layout, so,
2414  // o. it may fall-through to a block without explicit "goto" instruction
2415  // before layout, and no longer fall-through it after layout; or
2416  // o. just opposite.
2417  //
2418  // analyzeBranch() may return erroneous value for FBB when these two
2419  // situations take place. For the first scenario FBB is mistakenly set NULL;
2420  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2421  // mistakenly pointing to "*BI".
2422  // Thus, if the future change needs to use FBB before the layout is set, it
2423  // has to correct FBB first by using the code similar to the following:
2424  //
2425  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2426  // PrevBB->updateTerminator();
2427  // Cond.clear();
2428  // TBB = FBB = nullptr;
2429  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2430  // // FIXME: This should never take place.
2431  // TBB = FBB = nullptr;
2432  // }
2433  // }
2434  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2435  PrevBB->updateTerminator();
2436  }
2437 
2438  // Fixup the last block.
2439  Cond.clear();
2440  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2441  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2442  F->back().updateTerminator();
2443 
2444  BlockWorkList.clear();
2445  EHPadWorkList.clear();
2446 }
2447 
2448 void MachineBlockPlacement::optimizeBranches() {
2449  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2450  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2451 
2452  // Now that all the basic blocks in the chain have the proper layout,
2453  // make a final call to AnalyzeBranch with AllowModify set.
2454  // Indeed, the target may be able to optimize the branches in a way we
2455  // cannot because all branches may not be analyzable.
2456  // E.g., the target may be able to remove an unconditional branch to
2457  // a fallthrough when it occurs after predicated terminators.
2458  for (MachineBasicBlock *ChainBB : FunctionChain) {
2459  Cond.clear();
2460  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2461  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2462  // If PrevBB has a two-way branch, try to re-order the branches
2463  // such that we branch to the successor with higher probability first.
2464  if (TBB && !Cond.empty() && FBB &&
2465  MBPI->getEdgeProbability(ChainBB, FBB) >
2466  MBPI->getEdgeProbability(ChainBB, TBB) &&
2467  !TII->reverseBranchCondition(Cond)) {
2468  DEBUG(dbgs() << "Reverse order of the two branches: "
2469  << getBlockName(ChainBB) << "\n");
2470  DEBUG(dbgs() << " Edge probability: "
2471  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2472  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2473  DebugLoc dl; // FIXME: this is nowhere
2474  TII->removeBranch(*ChainBB);
2475  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2476  ChainBB->updateTerminator();
2477  }
2478  }
2479  }
2480 }
2481 
2482 void MachineBlockPlacement::alignBlocks() {
2483  // Walk through the backedges of the function now that we have fully laid out
2484  // the basic blocks and align the destination of each backedge. We don't rely
2485  // exclusively on the loop info here so that we can align backedges in
2486  // unnatural CFGs and backedges that were introduced purely because of the
2487  // loop rotations done during this layout pass.
2488  if (F->getFunction()->optForSize())
2489  return;
2490  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2491  if (FunctionChain.begin() == FunctionChain.end())
2492  return; // Empty chain.
2493 
2494  const BranchProbability ColdProb(1, 5); // 20%
2495  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2496  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2497  for (MachineBasicBlock *ChainBB : FunctionChain) {
2498  if (ChainBB == *FunctionChain.begin())
2499  continue;
2500 
2501  // Don't align non-looping basic blocks. These are unlikely to execute
2502  // enough times to matter in practice. Note that we'll still handle
2503  // unnatural CFGs inside of a natural outer loop (the common case) and
2504  // rotated loops.
2505  MachineLoop *L = MLI->getLoopFor(ChainBB);
2506  if (!L)
2507  continue;
2508 
2509  unsigned Align = TLI->getPrefLoopAlignment(L);
2510  if (!Align)
2511  continue; // Don't care about loop alignment.
2512 
2513  // If the block is cold relative to the function entry don't waste space
2514  // aligning it.
2515  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2516  if (Freq < WeightedEntryFreq)
2517  continue;
2518 
2519  // If the block is cold relative to its loop header, don't align it
2520  // regardless of what edges into the block exist.
2521  MachineBasicBlock *LoopHeader = L->getHeader();
2522  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2523  if (Freq < (LoopHeaderFreq * ColdProb))
2524  continue;
2525 
2526  // Check for the existence of a non-layout predecessor which would benefit
2527  // from aligning this block.
2528  MachineBasicBlock *LayoutPred =
2529  &*std::prev(MachineFunction::iterator(ChainBB));
2530 
2531  // Force alignment if all the predecessors are jumps. We already checked
2532  // that the block isn't cold above.
2533  if (!LayoutPred->isSuccessor(ChainBB)) {
2534  ChainBB->setAlignment(Align);
2535  continue;
2536  }
2537 
2538  // Align this block if the layout predecessor's edge into this block is
2539  // cold relative to the block. When this is true, other predecessors make up
2540  // all of the hot entries into the block and thus alignment is likely to be
2541  // important.
2542  BranchProbability LayoutProb =
2543  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2544  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2545  if (LayoutEdgeFreq <= (Freq * ColdProb))
2546  ChainBB->setAlignment(Align);
2547  }
2548 }
2549 
2550 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2551 /// it was duplicated into its chain predecessor and removed.
2552 /// \p BB - Basic block that may be duplicated.
2553 ///
2554 /// \p LPred - Chosen layout predecessor of \p BB.
2555 /// Updated to be the chain end if LPred is removed.
2556 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2557 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2558 /// Used to identify which blocks to update predecessor
2559 /// counts.
2560 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2561 /// chosen in the given order due to unnatural CFG
2562 /// only needed if \p BB is removed and
2563 /// \p PrevUnplacedBlockIt pointed to \p BB.
2564 /// @return true if \p BB was removed.
2565 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2566  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2567  const MachineBasicBlock *LoopHeaderBB,
2568  BlockChain &Chain, BlockFilterSet *BlockFilter,
2569  MachineFunction::iterator &PrevUnplacedBlockIt) {
2570  bool Removed, DuplicatedToLPred;
2571  bool DuplicatedToOriginalLPred;
2572  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2573  PrevUnplacedBlockIt,
2574  DuplicatedToLPred);
2575  if (!Removed)
2576  return false;
2577  DuplicatedToOriginalLPred = DuplicatedToLPred;
2578  // Iteratively try to duplicate again. It can happen that a block that is
2579  // duplicated into is still small enough to be duplicated again.
2580  // No need to call markBlockSuccessors in this case, as the blocks being
2581  // duplicated from here on are already scheduled.
2582  // Note that DuplicatedToLPred always implies Removed.
2583  while (DuplicatedToLPred) {
2584  assert(Removed && "Block must have been removed to be duplicated into its "
2585  "layout predecessor.");
2586  MachineBasicBlock *DupBB, *DupPred;
2587  // The removal callback causes Chain.end() to be updated when a block is
2588  // removed. On the first pass through the loop, the chain end should be the
2589  // same as it was on function entry. On subsequent passes, because we are
2590  // duplicating the block at the end of the chain, if it is removed the
2591  // chain will have shrunk by one block.
2592  BlockChain::iterator ChainEnd = Chain.end();
2593  DupBB = *(--ChainEnd);
2594  // Now try to duplicate again.
2595  if (ChainEnd == Chain.begin())
2596  break;
2597  DupPred = *std::prev(ChainEnd);
2598  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2599  PrevUnplacedBlockIt,
2600  DuplicatedToLPred);
2601  }
2602  // If BB was duplicated into LPred, it is now scheduled. But because it was
2603  // removed, markChainSuccessors won't be called for its chain. Instead we
2604  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2605  // at the end because repeating the tail duplication can increase the number
2606  // of unscheduled predecessors.
2607  LPred = *std::prev(Chain.end());
2608  if (DuplicatedToOriginalLPred)
2609  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2610  return true;
2611 }
2612 
2613 /// Tail duplicate \p BB into (some) predecessors if profitable.
2614 /// \p BB - Basic block that may be duplicated
2615 /// \p LPred - Chosen layout predecessor of \p BB
2616 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2617 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2618 /// Used to identify which blocks to update predecessor
2619 /// counts.
2620 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2621 /// chosen in the given order due to unnatural CFG
2622 /// only needed if \p BB is removed and
2623 /// \p PrevUnplacedBlockIt pointed to \p BB.
2624 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2625 /// only be true if the block was removed.
2626 /// \return - True if the block was duplicated into all preds and removed.
2627 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2629  BlockChain &Chain, BlockFilterSet *BlockFilter,
2630  MachineFunction::iterator &PrevUnplacedBlockIt,
2631  bool &DuplicatedToLPred) {
2632  DuplicatedToLPred = false;
2633  if (!shouldTailDuplicate(BB))
2634  return false;
2635 
2636  DEBUG(dbgs() << "Redoing tail duplication for Succ#"
2637  << BB->getNumber() << "\n");
2638 
2639  // This has to be a callback because none of it can be done after
2640  // BB is deleted.
2641  bool Removed = false;
2642  auto RemovalCallback =
2643  [&](MachineBasicBlock *RemBB) {
2644  // Signal to outer function
2645  Removed = true;
2646 
2647  // Conservative default.
2648  bool InWorkList = true;
2649  // Remove from the Chain and Chain Map
2650  if (BlockToChain.count(RemBB)) {
2651  BlockChain *Chain = BlockToChain[RemBB];
2652  InWorkList = Chain->UnscheduledPredecessors == 0;
2653  Chain->remove(RemBB);
2654  BlockToChain.erase(RemBB);
2655  }
2656 
2657  // Handle the unplaced block iterator
2658  if (&(*PrevUnplacedBlockIt) == RemBB) {
2659  PrevUnplacedBlockIt++;
2660  }
2661 
2662  // Handle the Work Lists
2663  if (InWorkList) {
2664  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2665  if (RemBB->isEHPad())
2666  RemoveList = EHPadWorkList;
2667  RemoveList.erase(
2668  llvm::remove_if(RemoveList,
2669  [RemBB](MachineBasicBlock *BB) {
2670  return BB == RemBB;
2671  }),
2672  RemoveList.end());
2673  }
2674 
2675  // Handle the filter set
2676  if (BlockFilter) {
2677  BlockFilter->remove(RemBB);
2678  }
2679 
2680  // Remove the block from loop info.
2681  MLI->removeBlock(RemBB);
2682  if (RemBB == PreferredLoopExit)
2683  PreferredLoopExit = nullptr;
2684 
2685  DEBUG(dbgs() << "TailDuplicator deleted block: "
2686  << getBlockName(RemBB) << "\n");
2687  };
2688  auto RemovalCallbackRef =
2689  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2690 
2691  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2692  bool IsSimple = TailDup.isSimpleBB(BB);
2693  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2694  &DuplicatedPreds, &RemovalCallbackRef);
2695 
2696  // Update UnscheduledPredecessors to reflect tail-duplication.
2697  DuplicatedToLPred = false;
2698  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2699  // We're only looking for unscheduled predecessors that match the filter.
2700  BlockChain* PredChain = BlockToChain[Pred];
2701  if (Pred == LPred)
2702  DuplicatedToLPred = true;
2703  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2704  || PredChain == &Chain)
2705  continue;
2706  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2707  if (BlockFilter && !BlockFilter->count(NewSucc))
2708  continue;
2709  BlockChain *NewChain = BlockToChain[NewSucc];
2710  if (NewChain != &Chain && NewChain != PredChain)
2711  NewChain->UnscheduledPredecessors++;
2712  }
2713  }
2714  return Removed;
2715 }
2716 
2717 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2718  if (skipFunction(*MF.getFunction()))
2719  return false;
2720 
2721  // Check for single-block functions and skip them.
2722  if (std::next(MF.begin()) == MF.end())
2723  return false;
2724 
2725  F = &MF;
2726  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2727  MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
2728  getAnalysis<MachineBlockFrequencyInfo>());
2729  MLI = &getAnalysis<MachineLoopInfo>();
2730  TII = MF.getSubtarget().getInstrInfo();
2731  TLI = MF.getSubtarget().getTargetLowering();
2732  MPDT = nullptr;
2733 
2734  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2735  // there are no MachineLoops.
2736  PreferredLoopExit = nullptr;
2737 
2738  assert(BlockToChain.empty() &&
2739  "BlockToChain map should be empty before starting placement.");
2740  assert(ComputedEdges.empty() &&
2741  "Computed Edge map should be empty before starting placement.");
2742 
2743  unsigned TailDupSize = TailDupPlacementThreshold;
2744  // If only the aggressive threshold is explicitly set, use it.
2745  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
2746  TailDupPlacementThreshold.getNumOccurrences() == 0)
2748 
2749  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
2750  // For agressive optimization, we can adjust some thresholds to be less
2751  // conservative.
2752  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
2753  // At O3 we should be more willing to copy blocks for tail duplication. This
2754  // increases size pressure, so we only do it at O3
2755  // Do this unless only the regular threshold is explicitly set.
2756  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
2757  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
2759  }
2760 
2761  if (TailDupPlacement) {
2762  MPDT = &getAnalysis<MachinePostDominatorTree>();
2763  if (MF.getFunction()->optForSize())
2764  TailDupSize = 1;
2765  bool PreRegAlloc = false;
2766  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
2767  precomputeTriangleChains();
2768  }
2769 
2770  buildCFGChains();
2771 
2772  // Changing the layout can create new tail merging opportunities.
2773  // TailMerge can create jump into if branches that make CFG irreducible for
2774  // HW that requires structured CFG.
2775  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
2776  PassConfig->getEnableTailMerge() &&
2778  // No tail merging opportunities if the block number is less than four.
2779  if (MF.size() > 3 && EnableTailMerge) {
2780  unsigned TailMergeSize = TailDupSize + 1;
2781  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
2782  *MBPI, TailMergeSize);
2783 
2784  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
2785  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
2786  /*AfterBlockPlacement=*/true)) {
2787  // Redo the layout if tail merging creates/removes/moves blocks.
2788  BlockToChain.clear();
2789  ComputedEdges.clear();
2790  // Must redo the post-dominator tree if blocks were changed.
2791  if (MPDT)
2792  MPDT->runOnMachineFunction(MF);
2793  ChainAllocator.DestroyAll();
2794  buildCFGChains();
2795  }
2796  }
2797 
2798  optimizeBranches();
2799  alignBlocks();
2800 
2801  BlockToChain.clear();
2802  ComputedEdges.clear();
2803  ChainAllocator.DestroyAll();
2804 
2805  if (AlignAllBlock)
2806  // Align all of the blocks in the function to a specific alignment.
2807  for (MachineBasicBlock &MBB : MF)
2809  else if (AlignAllNonFallThruBlocks) {
2810  // Align all of the blocks that have no fall-through predecessors to a
2811  // specific alignment.
2812  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
2813  auto LayoutPred = std::prev(MBI);
2814  if (!LayoutPred->isSuccessor(&*MBI))
2815  MBI->setAlignment(AlignAllNonFallThruBlocks);
2816  }
2817  }
2818  if (ViewBlockLayoutWithBFI != GVDT_None &&
2819  (ViewBlockFreqFuncName.empty() ||
2820  F->getFunction()->getName().equals(ViewBlockFreqFuncName))) {
2821  MBFI->view("MBP." + MF.getName(), false);
2822  }
2823 
2824 
2825  // We always return true as we have no way to track whether the final order
2826  // differs from the original order.
2827  return true;
2828 }
2829 
2830 namespace {
2831 
2832 /// \brief A pass to compute block placement statistics.
2833 ///
2834 /// A separate pass to compute interesting statistics for evaluating block
2835 /// placement. This is separate from the actual placement pass so that they can
2836 /// be computed in the absence of any placement transformations or when using
2837 /// alternative placement strategies.
2838 class MachineBlockPlacementStats : public MachineFunctionPass {
2839  /// \brief A handle to the branch probability pass.
2840  const MachineBranchProbabilityInfo *MBPI;
2841 
2842  /// \brief A handle to the function-wide block frequency pass.
2843  const MachineBlockFrequencyInfo *MBFI;
2844 
2845 public:
2846  static char ID; // Pass identification, replacement for typeid
2847 
2848  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
2850  }
2851 
2852  bool runOnMachineFunction(MachineFunction &F) override;
2853 
2854  void getAnalysisUsage(AnalysisUsage &AU) const override {
2857  AU.setPreservesAll();
2859  }
2860 };
2861 
2862 } // end anonymous namespace
2863 
2865 
2867 
2868 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
2869  "Basic Block Placement Stats", false, false)
2872 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
2873  "Basic Block Placement Stats", false, false)
2874 
2875 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
2876  // Check for single-block functions and skip them.
2877  if (std::next(F.begin()) == F.end())
2878  return false;
2879 
2880  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2881  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
2882 
2883  for (MachineBasicBlock &MBB : F) {
2884  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
2885  Statistic &NumBranches =
2886  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
2887  Statistic &BranchTakenFreq =
2888  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
2889  for (MachineBasicBlock *Succ : MBB.successors()) {
2890  // Skip if this successor is a fallthrough.
2891  if (MBB.isLayoutSuccessor(Succ))
2892  continue;
2893 
2894  BlockFrequency EdgeFreq =
2895  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
2896  ++NumBranches;
2897  BranchTakenFreq += EdgeFreq.getFrequency();
2898  }
2899  }
2900 
2901  return false;
2902 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:244
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:235
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:329
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:449
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:136
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:89
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:34
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void setAlignment(unsigned Align)
Set alignment of the basic block.
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:191
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:465
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:100
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:232
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:406
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:874
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:406
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:291
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:116
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
virtual unsigned getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:530
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:434
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:853
Optional< uint64_t > getEntryCount() const
Get the entry count for this function.
Definition: Function.cpp:1330
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:449
const MachineBasicBlock & front() const
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:834
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:93
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:110
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:298
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:864
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:385
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:923
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:328
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:169
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:163
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " "blocks in the function."), cl::init(0), cl::Hidden)
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:120
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:656
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:61
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:149
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:220
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:155
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:141
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:181
const Function * getFunction() const
getFunction - Return the LLVM function that this machine code represents
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:462
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
#define DEBUG(X)
Definition: Debug.h:118
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:154
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all " "blocks that have no fall-through predecessors (i.e. don't add " "nops that are executed)."), cl::init(0), cl::Hidden)
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.