LLVM  9.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/IR/Function.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Allocator.h"
55 #include "llvm/Support/CodeGen.h"
57 #include "llvm/Support/Compiler.h"
58 #include "llvm/Support/Debug.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <memory>
66 #include <string>
67 #include <tuple>
68 #include <utility>
69 #include <vector>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "block-placement"
74 
75 STATISTIC(NumCondBranches, "Number of conditional branches");
76 STATISTIC(NumUncondBranches, "Number of unconditional branches");
77 STATISTIC(CondBranchTakenFreq,
78  "Potential frequency of taking conditional branches");
79 STATISTIC(UncondBranchTakenFreq,
80  "Potential frequency of taking unconditional branches");
81 
82 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
83  cl::desc("Force the alignment of all "
84  "blocks in the function."),
85  cl::init(0), cl::Hidden);
86 
88  "align-all-nofallthru-blocks",
89  cl::desc("Force the alignment of all "
90  "blocks that have no fall-through predecessors (i.e. don't add "
91  "nops that are executed)."),
92  cl::init(0), cl::Hidden);
93 
94 // FIXME: Find a good default for this flag and remove the flag.
96  "block-placement-exit-block-bias",
97  cl::desc("Block frequency percentage a loop exit block needs "
98  "over the original exit to be considered the new exit."),
99  cl::init(0), cl::Hidden);
100 
101 // Definition:
102 // - Outlining: placement of a basic block outside the chain or hot path.
103 
105  "loop-to-cold-block-ratio",
106  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
107  "(frequency of block) is greater than this ratio"),
108  cl::init(5), cl::Hidden);
109 
111  "force-loop-cold-block",
112  cl::desc("Force outlining cold blocks from loops."),
113  cl::init(false), cl::Hidden);
114 
115 static cl::opt<bool>
116  PreciseRotationCost("precise-rotation-cost",
117  cl::desc("Model the cost of loop rotation more "
118  "precisely by using profile data."),
119  cl::init(false), cl::Hidden);
120 
121 static cl::opt<bool>
122  ForcePreciseRotationCost("force-precise-rotation-cost",
123  cl::desc("Force the use of precise cost "
124  "loop rotation strategy."),
125  cl::init(false), cl::Hidden);
126 
128  "misfetch-cost",
129  cl::desc("Cost that models the probabilistic risk of an instruction "
130  "misfetch due to a jump comparing to falling through, whose cost "
131  "is zero."),
132  cl::init(1), cl::Hidden);
133 
134 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
135  cl::desc("Cost of jump instructions."),
136  cl::init(1), cl::Hidden);
137 static cl::opt<bool>
138 TailDupPlacement("tail-dup-placement",
139  cl::desc("Perform tail duplication during placement. "
140  "Creates more fallthrough opportunites in "
141  "outline branches."),
142  cl::init(true), cl::Hidden);
143 
144 static cl::opt<bool>
145 BranchFoldPlacement("branch-fold-placement",
146  cl::desc("Perform branch folding during placement. "
147  "Reduces code size."),
148  cl::init(true), cl::Hidden);
149 
150 // Heuristic for tail duplication.
152  "tail-dup-placement-threshold",
153  cl::desc("Instruction cutoff for tail duplication during layout. "
154  "Tail merging during layout is forced to have a threshold "
155  "that won't conflict."), cl::init(2),
156  cl::Hidden);
157 
158 // Heuristic for aggressive tail duplication.
160  "tail-dup-placement-aggressive-threshold",
161  cl::desc("Instruction cutoff for aggressive tail duplication during "
162  "layout. Used at -O3. Tail merging during layout is forced to "
163  "have a threshold that won't conflict."), cl::init(4),
164  cl::Hidden);
165 
166 // Heuristic for tail duplication.
168  "tail-dup-placement-penalty",
169  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
170  "Copying can increase fallthrough, but it also increases icache "
171  "pressure. This parameter controls the penalty to account for that. "
172  "Percent as integer."),
173  cl::init(2),
174  cl::Hidden);
175 
176 // Heuristic for triangle chains.
178  "triangle-chain-count",
179  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
180  "triangle tail duplication heuristic to kick in. 0 to disable."),
181  cl::init(2),
182  cl::Hidden);
183 
186 
187 // Internal option used to control BFI display only after MBP pass.
188 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
189 // -view-block-layout-with-bfi=
191 
192 // Command line option to specify the name of the function for CFG dump
193 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
195 
196 namespace {
197 
198 class BlockChain;
199 
200 /// Type for our function-wide basic block -> block chain mapping.
201 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
202 
203 /// A chain of blocks which will be laid out contiguously.
204 ///
205 /// This is the datastructure representing a chain of consecutive blocks that
206 /// are profitable to layout together in order to maximize fallthrough
207 /// probabilities and code locality. We also can use a block chain to represent
208 /// a sequence of basic blocks which have some external (correctness)
209 /// requirement for sequential layout.
210 ///
211 /// Chains can be built around a single basic block and can be merged to grow
212 /// them. They participate in a block-to-chain mapping, which is updated
213 /// automatically as chains are merged together.
214 class BlockChain {
215  /// The sequence of blocks belonging to this chain.
216  ///
217  /// This is the sequence of blocks for a particular chain. These will be laid
218  /// out in-order within the function.
220 
221  /// A handle to the function-wide basic block to block chain mapping.
222  ///
223  /// This is retained in each block chain to simplify the computation of child
224  /// block chains for SCC-formation and iteration. We store the edges to child
225  /// basic blocks, and map them back to their associated chains using this
226  /// structure.
227  BlockToChainMapType &BlockToChain;
228 
229 public:
230  /// Construct a new BlockChain.
231  ///
232  /// This builds a new block chain representing a single basic block in the
233  /// function. It also registers itself as the chain that block participates
234  /// in with the BlockToChain mapping.
235  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
236  : Blocks(1, BB), BlockToChain(BlockToChain) {
237  assert(BB && "Cannot create a chain with a null basic block");
238  BlockToChain[BB] = this;
239  }
240 
241  /// Iterator over blocks within the chain.
244 
245  /// Beginning of blocks within the chain.
246  iterator begin() { return Blocks.begin(); }
247  const_iterator begin() const { return Blocks.begin(); }
248 
249  /// End of blocks within the chain.
250  iterator end() { return Blocks.end(); }
251  const_iterator end() const { return Blocks.end(); }
252 
253  bool remove(MachineBasicBlock* BB) {
254  for(iterator i = begin(); i != end(); ++i) {
255  if (*i == BB) {
256  Blocks.erase(i);
257  return true;
258  }
259  }
260  return false;
261  }
262 
263  /// Merge a block chain into this one.
264  ///
265  /// This routine merges a block chain into this one. It takes care of forming
266  /// a contiguous sequence of basic blocks, updating the edge list, and
267  /// updating the block -> chain mapping. It does not free or tear down the
268  /// old chain, but the old chain's block list is no longer valid.
269  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
270  assert(BB && "Can't merge a null block.");
271  assert(!Blocks.empty() && "Can't merge into an empty chain.");
272 
273  // Fast path in case we don't have a chain already.
274  if (!Chain) {
275  assert(!BlockToChain[BB] &&
276  "Passed chain is null, but BB has entry in BlockToChain.");
277  Blocks.push_back(BB);
278  BlockToChain[BB] = this;
279  return;
280  }
281 
282  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
283  assert(Chain->begin() != Chain->end());
284 
285  // Update the incoming blocks to point to this chain, and add them to the
286  // chain structure.
287  for (MachineBasicBlock *ChainBB : *Chain) {
288  Blocks.push_back(ChainBB);
289  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
290  BlockToChain[ChainBB] = this;
291  }
292  }
293 
294 #ifndef NDEBUG
295  /// Dump the blocks in this chain.
296  LLVM_DUMP_METHOD void dump() {
297  for (MachineBasicBlock *MBB : *this)
298  MBB->dump();
299  }
300 #endif // NDEBUG
301 
302  /// Count of predecessors of any block within the chain which have not
303  /// yet been scheduled. In general, we will delay scheduling this chain
304  /// until those predecessors are scheduled (or we find a sufficiently good
305  /// reason to override this heuristic.) Note that when forming loop chains,
306  /// blocks outside the loop are ignored and treated as if they were already
307  /// scheduled.
308  ///
309  /// Note: This field is reinitialized multiple times - once for each loop,
310  /// and then once for the function as a whole.
311  unsigned UnscheduledPredecessors = 0;
312 };
313 
314 class MachineBlockPlacement : public MachineFunctionPass {
315  /// A type for a block filter set.
316  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
317 
318  /// Pair struct containing basic block and taildup profitability
319  struct BlockAndTailDupResult {
320  MachineBasicBlock *BB;
321  bool ShouldTailDup;
322  };
323 
324  /// Triple struct containing edge weight and the edge.
325  struct WeightedEdge {
326  BlockFrequency Weight;
327  MachineBasicBlock *Src;
328  MachineBasicBlock *Dest;
329  };
330 
331  /// work lists of blocks that are ready to be laid out
334 
335  /// Edges that have already been computed as optimal.
337 
338  /// Machine Function
340 
341  /// A handle to the branch probability pass.
342  const MachineBranchProbabilityInfo *MBPI;
343 
344  /// A handle to the function-wide block frequency pass.
345  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
346 
347  /// A handle to the loop info.
348  MachineLoopInfo *MLI;
349 
350  /// Preferred loop exit.
351  /// Member variable for convenience. It may be removed by duplication deep
352  /// in the call stack.
353  MachineBasicBlock *PreferredLoopExit;
354 
355  /// A handle to the target's instruction info.
356  const TargetInstrInfo *TII;
357 
358  /// A handle to the target's lowering info.
359  const TargetLoweringBase *TLI;
360 
361  /// A handle to the post dominator tree.
363 
364  /// Duplicator used to duplicate tails during placement.
365  ///
366  /// Placement decisions can open up new tail duplication opportunities, but
367  /// since tail duplication affects placement decisions of later blocks, it
368  /// must be done inline.
369  TailDuplicator TailDup;
370 
371  /// Allocator and owner of BlockChain structures.
372  ///
373  /// We build BlockChains lazily while processing the loop structure of
374  /// a function. To reduce malloc traffic, we allocate them using this
375  /// slab-like allocator, and destroy them after the pass completes. An
376  /// important guarantee is that this allocator produces stable pointers to
377  /// the chains.
379 
380  /// Function wide BasicBlock to BlockChain mapping.
381  ///
382  /// This mapping allows efficiently moving from any given basic block to the
383  /// BlockChain it participates in, if any. We use it to, among other things,
384  /// allow implicitly defining edges between chains as the existing edges
385  /// between basic blocks.
387 
388 #ifndef NDEBUG
389  /// The set of basic blocks that have terminators that cannot be fully
390  /// analyzed. These basic blocks cannot be re-ordered safely by
391  /// MachineBlockPlacement, and we must preserve physical layout of these
392  /// blocks and their successors through the pass.
393  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
394 #endif
395 
396  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
397  /// if the count goes to 0, add them to the appropriate work list.
398  void markChainSuccessors(
399  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
400  const BlockFilterSet *BlockFilter = nullptr);
401 
402  /// Decrease the UnscheduledPredecessors count for a single block, and
403  /// if the count goes to 0, add them to the appropriate work list.
404  void markBlockSuccessors(
405  const BlockChain &Chain, const MachineBasicBlock *BB,
406  const MachineBasicBlock *LoopHeaderBB,
407  const BlockFilterSet *BlockFilter = nullptr);
408 
410  collectViableSuccessors(
411  const MachineBasicBlock *BB, const BlockChain &Chain,
412  const BlockFilterSet *BlockFilter,
414  bool shouldPredBlockBeOutlined(
415  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
416  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
417  BranchProbability SuccProb, BranchProbability HotProb);
418  bool repeatedlyTailDuplicateBlock(
420  const MachineBasicBlock *LoopHeaderBB,
421  BlockChain &Chain, BlockFilterSet *BlockFilter,
422  MachineFunction::iterator &PrevUnplacedBlockIt);
423  bool maybeTailDuplicateBlock(
425  BlockChain &Chain, BlockFilterSet *BlockFilter,
426  MachineFunction::iterator &PrevUnplacedBlockIt,
427  bool &DuplicatedToLPred);
428  bool hasBetterLayoutPredecessor(
429  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
430  const BlockChain &SuccChain, BranchProbability SuccProb,
431  BranchProbability RealSuccProb, const BlockChain &Chain,
432  const BlockFilterSet *BlockFilter);
433  BlockAndTailDupResult selectBestSuccessor(
434  const MachineBasicBlock *BB, const BlockChain &Chain,
435  const BlockFilterSet *BlockFilter);
436  MachineBasicBlock *selectBestCandidateBlock(
437  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
438  MachineBasicBlock *getFirstUnplacedBlock(
439  const BlockChain &PlacedChain,
440  MachineFunction::iterator &PrevUnplacedBlockIt,
441  const BlockFilterSet *BlockFilter);
442 
443  /// Add a basic block to the work list if it is appropriate.
444  ///
445  /// If the optional parameter BlockFilter is provided, only MBB
446  /// present in the set will be added to the worklist. If nullptr
447  /// is provided, no filtering occurs.
448  void fillWorkLists(const MachineBasicBlock *MBB,
449  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
450  const BlockFilterSet *BlockFilter);
451 
452  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
453  BlockFilterSet *BlockFilter = nullptr);
454  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
455  const MachineBasicBlock *OldTop);
456  MachineBasicBlock *findBestLoopTop(
457  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
458  MachineBasicBlock *findBestLoopExit(
459  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
460  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
461  void buildLoopChains(const MachineLoop &L);
462  void rotateLoop(
463  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
464  const BlockFilterSet &LoopBlockSet);
465  void rotateLoopWithProfile(
466  BlockChain &LoopChain, const MachineLoop &L,
467  const BlockFilterSet &LoopBlockSet);
468  void buildCFGChains();
469  void optimizeBranches();
470  void alignBlocks();
471  /// Returns true if a block should be tail-duplicated to increase fallthrough
472  /// opportunities.
473  bool shouldTailDuplicate(MachineBasicBlock *BB);
474  /// Check the edge frequencies to see if tail duplication will increase
475  /// fallthroughs.
476  bool isProfitableToTailDup(
477  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
478  BranchProbability QProb,
479  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
480 
481  /// Check for a trellis layout.
482  bool isTrellis(const MachineBasicBlock *BB,
483  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
484  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
485 
486  /// Get the best successor given a trellis layout.
487  BlockAndTailDupResult getBestTrellisSuccessor(
488  const MachineBasicBlock *BB,
489  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
490  BranchProbability AdjustedSumProb, const BlockChain &Chain,
491  const BlockFilterSet *BlockFilter);
492 
493  /// Get the best pair of non-conflicting edges.
494  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
495  const MachineBasicBlock *BB,
497 
498  /// Returns true if a block can tail duplicate into all unplaced
499  /// predecessors. Filters based on loop.
500  bool canTailDuplicateUnplacedPreds(
501  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
502  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
503 
504  /// Find chains of triangles to tail-duplicate where a global analysis works,
505  /// but a local analysis would not find them.
506  void precomputeTriangleChains();
507 
508 public:
509  static char ID; // Pass identification, replacement for typeid
510 
511  MachineBlockPlacement() : MachineFunctionPass(ID) {
513  }
514 
515  bool runOnMachineFunction(MachineFunction &F) override;
516 
517  bool allowTailDupPlacement() const {
518  assert(F);
520  }
521 
522  void getAnalysisUsage(AnalysisUsage &AU) const override {
525  if (TailDupPlacement)
530  }
531 };
532 
533 } // end anonymous namespace
534 
536 
538 
539 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
540  "Branch Probability Basic Block Placement", false, false)
545 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
546  "Branch Probability Basic Block Placement", false, false)
547 
548 #ifndef NDEBUG
549 /// Helper to print the name of a MBB.
550 ///
551 /// Only used by debug logging.
552 static std::string getBlockName(const MachineBasicBlock *BB) {
553  std::string Result;
554  raw_string_ostream OS(Result);
555  OS << printMBBReference(*BB);
556  OS << " ('" << BB->getName() << "')";
557  OS.flush();
558  return Result;
559 }
560 #endif
561 
562 /// Mark a chain's successors as having one fewer preds.
563 ///
564 /// When a chain is being merged into the "placed" chain, this routine will
565 /// quickly walk the successors of each block in the chain and mark them as
566 /// having one fewer active predecessor. It also adds any successors of this
567 /// chain which reach the zero-predecessor state to the appropriate worklist.
568 void MachineBlockPlacement::markChainSuccessors(
569  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
570  const BlockFilterSet *BlockFilter) {
571  // Walk all the blocks in this chain, marking their successors as having
572  // a predecessor placed.
573  for (MachineBasicBlock *MBB : Chain) {
574  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
575  }
576 }
577 
578 /// Mark a single block's successors as having one fewer preds.
579 ///
580 /// Under normal circumstances, this is only called by markChainSuccessors,
581 /// but if a block that was to be placed is completely tail-duplicated away,
582 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
583 /// for just that block.
584 void MachineBlockPlacement::markBlockSuccessors(
585  const BlockChain &Chain, const MachineBasicBlock *MBB,
586  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
587  // Add any successors for which this is the only un-placed in-loop
588  // predecessor to the worklist as a viable candidate for CFG-neutral
589  // placement. No subsequent placement of this block will violate the CFG
590  // shape, so we get to use heuristics to choose a favorable placement.
591  for (MachineBasicBlock *Succ : MBB->successors()) {
592  if (BlockFilter && !BlockFilter->count(Succ))
593  continue;
594  BlockChain &SuccChain = *BlockToChain[Succ];
595  // Disregard edges within a fixed chain, or edges to the loop header.
596  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
597  continue;
598 
599  // This is a cross-chain edge that is within the loop, so decrement the
600  // loop predecessor count of the destination chain.
601  if (SuccChain.UnscheduledPredecessors == 0 ||
602  --SuccChain.UnscheduledPredecessors > 0)
603  continue;
604 
605  auto *NewBB = *SuccChain.begin();
606  if (NewBB->isEHPad())
607  EHPadWorkList.push_back(NewBB);
608  else
609  BlockWorkList.push_back(NewBB);
610  }
611 }
612 
613 /// This helper function collects the set of successors of block
614 /// \p BB that are allowed to be its layout successors, and return
615 /// the total branch probability of edges from \p BB to those
616 /// blocks.
617 BranchProbability MachineBlockPlacement::collectViableSuccessors(
618  const MachineBasicBlock *BB, const BlockChain &Chain,
619  const BlockFilterSet *BlockFilter,
621  // Adjust edge probabilities by excluding edges pointing to blocks that is
622  // either not in BlockFilter or is already in the current chain. Consider the
623  // following CFG:
624  //
625  // --->A
626  // | / \
627  // | B C
628  // | \ / \
629  // ----D E
630  //
631  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
632  // A->C is chosen as a fall-through, D won't be selected as a successor of C
633  // due to CFG constraint (the probability of C->D is not greater than
634  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
635  // when calculating the probability of C->D, D will be selected and we
636  // will get A C D B as the layout of this loop.
637  auto AdjustedSumProb = BranchProbability::getOne();
638  for (MachineBasicBlock *Succ : BB->successors()) {
639  bool SkipSucc = false;
640  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
641  SkipSucc = true;
642  } else {
643  BlockChain *SuccChain = BlockToChain[Succ];
644  if (SuccChain == &Chain) {
645  SkipSucc = true;
646  } else if (Succ != *SuccChain->begin()) {
647  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
648  << " -> Mid chain!\n");
649  continue;
650  }
651  }
652  if (SkipSucc)
653  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
654  else
655  Successors.push_back(Succ);
656  }
657 
658  return AdjustedSumProb;
659 }
660 
661 /// The helper function returns the branch probability that is adjusted
662 /// or normalized over the new total \p AdjustedSumProb.
663 static BranchProbability
665  BranchProbability AdjustedSumProb) {
666  BranchProbability SuccProb;
667  uint32_t SuccProbN = OrigProb.getNumerator();
668  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
669  if (SuccProbN >= SuccProbD)
670  SuccProb = BranchProbability::getOne();
671  else
672  SuccProb = BranchProbability(SuccProbN, SuccProbD);
673 
674  return SuccProb;
675 }
676 
677 /// Check if \p BB has exactly the successors in \p Successors.
678 static bool
681  if (BB.succ_size() != Successors.size())
682  return false;
683  // We don't want to count self-loops
684  if (Successors.count(&BB))
685  return false;
686  for (MachineBasicBlock *Succ : BB.successors())
687  if (!Successors.count(Succ))
688  return false;
689  return true;
690 }
691 
692 /// Check if a block should be tail duplicated to increase fallthrough
693 /// opportunities.
694 /// \p BB Block to check.
695 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
696  // Blocks with single successors don't create additional fallthrough
697  // opportunities. Don't duplicate them. TODO: When conditional exits are
698  // analyzable, allow them to be duplicated.
699  bool IsSimple = TailDup.isSimpleBB(BB);
700 
701  if (BB->succ_size() == 1)
702  return false;
703  return TailDup.shouldTailDuplicate(IsSimple, *BB);
704 }
705 
706 /// Compare 2 BlockFrequency's with a small penalty for \p A.
707 /// In order to be conservative, we apply a X% penalty to account for
708 /// increased icache pressure and static heuristics. For small frequencies
709 /// we use only the numerators to improve accuracy. For simplicity, we assume the
710 /// penalty is less than 100%
711 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
713  uint64_t EntryFreq) {
714  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
715  BlockFrequency Gain = A - B;
716  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
717 }
718 
719 /// Check the edge frequencies to see if tail duplication will increase
720 /// fallthroughs. It only makes sense to call this function when
721 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
722 /// always locally profitable if we would have picked \p Succ without
723 /// considering duplication.
724 bool MachineBlockPlacement::isProfitableToTailDup(
725  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
726  BranchProbability QProb,
727  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
728  // We need to do a probability calculation to make sure this is profitable.
729  // First: does succ have a successor that post-dominates? This affects the
730  // calculation. The 2 relevant cases are:
731  // BB BB
732  // | \Qout | \Qout
733  // P| C |P C
734  // = C' = C'
735  // | /Qin | /Qin
736  // | / | /
737  // Succ Succ
738  // / \ | \ V
739  // U/ =V |U \
740  // / \ = D
741  // D E | /
742  // | /
743  // |/
744  // PDom
745  // '=' : Branch taken for that CFG edge
746  // In the second case, Placing Succ while duplicating it into C prevents the
747  // fallthrough of Succ into either D or PDom, because they now have C as an
748  // unplaced predecessor
749 
750  // Start by figuring out which case we fall into
751  MachineBasicBlock *PDom = nullptr;
753  // Only scan the relevant successors
754  auto AdjustedSuccSumProb =
755  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
756  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
757  auto BBFreq = MBFI->getBlockFreq(BB);
758  auto SuccFreq = MBFI->getBlockFreq(Succ);
759  BlockFrequency P = BBFreq * PProb;
760  BlockFrequency Qout = BBFreq * QProb;
761  uint64_t EntryFreq = MBFI->getEntryFreq();
762  // If there are no more successors, it is profitable to copy, as it strictly
763  // increases fallthrough.
764  if (SuccSuccs.size() == 0)
765  return greaterWithBias(P, Qout, EntryFreq);
766 
767  auto BestSuccSucc = BranchProbability::getZero();
768  // Find the PDom or the best Succ if no PDom exists.
769  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
770  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
771  if (Prob > BestSuccSucc)
772  BestSuccSucc = Prob;
773  if (PDom == nullptr)
774  if (MPDT->dominates(SuccSucc, Succ)) {
775  PDom = SuccSucc;
776  break;
777  }
778  }
779  // For the comparisons, we need to know Succ's best incoming edge that isn't
780  // from BB.
781  auto SuccBestPred = BlockFrequency(0);
782  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
783  if (SuccPred == Succ || SuccPred == BB
784  || BlockToChain[SuccPred] == &Chain
785  || (BlockFilter && !BlockFilter->count(SuccPred)))
786  continue;
787  auto Freq = MBFI->getBlockFreq(SuccPred)
788  * MBPI->getEdgeProbability(SuccPred, Succ);
789  if (Freq > SuccBestPred)
790  SuccBestPred = Freq;
791  }
792  // Qin is Succ's best unplaced incoming edge that isn't BB
793  BlockFrequency Qin = SuccBestPred;
794  // If it doesn't have a post-dominating successor, here is the calculation:
795  // BB BB
796  // | \Qout | \
797  // P| C | =
798  // = C' | C
799  // | /Qin | |
800  // | / | C' (+Succ)
801  // Succ Succ /|
802  // / \ | \/ |
803  // U/ =V | == |
804  // / \ | / \|
805  // D E D E
806  // '=' : Branch taken for that CFG edge
807  // Cost in the first case is: P + V
808  // For this calculation, we always assume P > Qout. If Qout > P
809  // The result of this function will be ignored at the caller.
810  // Let F = SuccFreq - Qin
811  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
812 
813  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
814  BranchProbability UProb = BestSuccSucc;
815  BranchProbability VProb = AdjustedSuccSumProb - UProb;
816  BlockFrequency F = SuccFreq - Qin;
817  BlockFrequency V = SuccFreq * VProb;
818  BlockFrequency QinU = std::min(Qin, F) * UProb;
819  BlockFrequency BaseCost = P + V;
820  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
821  return greaterWithBias(BaseCost, DupCost, EntryFreq);
822  }
823  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
824  BranchProbability VProb = AdjustedSuccSumProb - UProb;
825  BlockFrequency U = SuccFreq * UProb;
826  BlockFrequency V = SuccFreq * VProb;
827  BlockFrequency F = SuccFreq - Qin;
828  // If there is a post-dominating successor, here is the calculation:
829  // BB BB BB BB
830  // | \Qout | \ | \Qout | \
831  // |P C | = |P C | =
832  // = C' |P C = C' |P C
833  // | /Qin | | | /Qin | |
834  // | / | C' (+Succ) | / | C' (+Succ)
835  // Succ Succ /| Succ Succ /|
836  // | \ V | \/ | | \ V | \/ |
837  // |U \ |U /\ =? |U = |U /\ |
838  // = D = = =?| | D | = =|
839  // | / |/ D | / |/ D
840  // | / | / | = | /
841  // |/ | / |/ | =
842  // Dom Dom Dom Dom
843  // '=' : Branch taken for that CFG edge
844  // The cost for taken branches in the first case is P + U
845  // Let F = SuccFreq - Qin
846  // The cost in the second case (assuming independence), given the layout:
847  // BB, Succ, (C+Succ), D, Dom or the layout:
848  // BB, Succ, D, Dom, (C+Succ)
849  // is Qout + max(F, Qin) * U + min(F, Qin)
850  // compare P + U vs Qout + P * U + Qin.
851  //
852  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
853  //
854  // For the 3rd case, the cost is P + 2 * V
855  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
856  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
857  if (UProb > AdjustedSuccSumProb / 2 &&
858  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
859  Chain, BlockFilter))
860  // Cases 3 & 4
861  return greaterWithBias(
862  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
863  EntryFreq);
864  // Cases 1 & 2
865  return greaterWithBias((P + U),
866  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
867  std::max(Qin, F) * UProb),
868  EntryFreq);
869 }
870 
871 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
872 /// successors form the lower part of a trellis. A successor set S forms the
873 /// lower part of a trellis if all of the predecessors of S are either in S or
874 /// have all of S as successors. We ignore trellises where BB doesn't have 2
875 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
876 /// are very uncommon and complex to compute optimally. Allowing edges within S
877 /// is not strictly a trellis, but the same algorithm works, so we allow it.
878 bool MachineBlockPlacement::isTrellis(
879  const MachineBasicBlock *BB,
880  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
881  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
882  // Technically BB could form a trellis with branching factor higher than 2.
883  // But that's extremely uncommon.
884  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
885  return false;
886 
888  BB->succ_end());
889  // To avoid reviewing the same predecessors twice.
891 
892  for (MachineBasicBlock *Succ : ViableSuccs) {
893  int PredCount = 0;
894  for (auto SuccPred : Succ->predecessors()) {
895  // Allow triangle successors, but don't count them.
896  if (Successors.count(SuccPred)) {
897  // Make sure that it is actually a triangle.
898  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
899  if (!Successors.count(CheckSucc))
900  return false;
901  continue;
902  }
903  const BlockChain *PredChain = BlockToChain[SuccPred];
904  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
905  PredChain == &Chain || PredChain == BlockToChain[Succ])
906  continue;
907  ++PredCount;
908  // Perform the successor check only once.
909  if (!SeenPreds.insert(SuccPred).second)
910  continue;
911  if (!hasSameSuccessors(*SuccPred, Successors))
912  return false;
913  }
914  // If one of the successors has only BB as a predecessor, it is not a
915  // trellis.
916  if (PredCount < 1)
917  return false;
918  }
919  return true;
920 }
921 
922 /// Pick the highest total weight pair of edges that can both be laid out.
923 /// The edges in \p Edges[0] are assumed to have a different destination than
924 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
925 /// the individual highest weight edges to the 2 different destinations, or in
926 /// case of a conflict, one of them should be replaced with a 2nd best edge.
927 std::pair<MachineBlockPlacement::WeightedEdge,
928  MachineBlockPlacement::WeightedEdge>
929 MachineBlockPlacement::getBestNonConflictingEdges(
930  const MachineBasicBlock *BB,
932  Edges) {
933  // Sort the edges, and then for each successor, find the best incoming
934  // predecessor. If the best incoming predecessors aren't the same,
935  // then that is clearly the best layout. If there is a conflict, one of the
936  // successors will have to fallthrough from the second best predecessor. We
937  // compare which combination is better overall.
938 
939  // Sort for highest frequency.
940  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
941 
942  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
943  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
944  auto BestA = Edges[0].begin();
945  auto BestB = Edges[1].begin();
946  // Arrange for the correct answer to be in BestA and BestB
947  // If the 2 best edges don't conflict, the answer is already there.
948  if (BestA->Src == BestB->Src) {
949  // Compare the total fallthrough of (Best + Second Best) for both pairs
950  auto SecondBestA = std::next(BestA);
951  auto SecondBestB = std::next(BestB);
952  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
953  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
954  if (BestAScore < BestBScore)
955  BestA = SecondBestA;
956  else
957  BestB = SecondBestB;
958  }
959  // Arrange for the BB edge to be in BestA if it exists.
960  if (BestB->Src == BB)
961  std::swap(BestA, BestB);
962  return std::make_pair(*BestA, *BestB);
963 }
964 
965 /// Get the best successor from \p BB based on \p BB being part of a trellis.
966 /// We only handle trellises with 2 successors, so the algorithm is
967 /// straightforward: Find the best pair of edges that don't conflict. We find
968 /// the best incoming edge for each successor in the trellis. If those conflict,
969 /// we consider which of them should be replaced with the second best.
970 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
971 /// comes from \p BB, it will be in \p BestEdges[0]
972 MachineBlockPlacement::BlockAndTailDupResult
973 MachineBlockPlacement::getBestTrellisSuccessor(
974  const MachineBasicBlock *BB,
975  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
976  BranchProbability AdjustedSumProb, const BlockChain &Chain,
977  const BlockFilterSet *BlockFilter) {
978 
979  BlockAndTailDupResult Result = {nullptr, false};
981  BB->succ_end());
982 
983  // We assume size 2 because it's common. For general n, we would have to do
984  // the Hungarian algorithm, but it's not worth the complexity because more
985  // than 2 successors is fairly uncommon, and a trellis even more so.
986  if (Successors.size() != 2 || ViableSuccs.size() != 2)
987  return Result;
988 
989  // Collect the edge frequencies of all edges that form the trellis.
991  int SuccIndex = 0;
992  for (auto Succ : ViableSuccs) {
993  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
994  // Skip any placed predecessors that are not BB
995  if (SuccPred != BB)
996  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
997  BlockToChain[SuccPred] == &Chain ||
998  BlockToChain[SuccPred] == BlockToChain[Succ])
999  continue;
1000  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1001  MBPI->getEdgeProbability(SuccPred, Succ);
1002  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1003  }
1004  ++SuccIndex;
1005  }
1006 
1007  // Pick the best combination of 2 edges from all the edges in the trellis.
1008  WeightedEdge BestA, BestB;
1009  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1010 
1011  if (BestA.Src != BB) {
1012  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1013  // we shouldn't choose any successor. We've already looked and there's a
1014  // better fallthrough edge for all the successors.
1015  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1016  return Result;
1017  }
1018 
1019  // Did we pick the triangle edge? If tail-duplication is profitable, do
1020  // that instead. Otherwise merge the triangle edge now while we know it is
1021  // optimal.
1022  if (BestA.Dest == BestB.Src) {
1023  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1024  // would be better.
1025  MachineBasicBlock *Succ1 = BestA.Dest;
1026  MachineBasicBlock *Succ2 = BestB.Dest;
1027  // Check to see if tail-duplication would be profitable.
1028  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1029  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1030  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1031  Chain, BlockFilter)) {
1033  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1034  dbgs() << " Selected: " << getBlockName(Succ2)
1035  << ", probability: " << Succ2Prob
1036  << " (Tail Duplicate)\n");
1037  Result.BB = Succ2;
1038  Result.ShouldTailDup = true;
1039  return Result;
1040  }
1041  }
1042  // We have already computed the optimal edge for the other side of the
1043  // trellis.
1044  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1045 
1046  auto TrellisSucc = BestA.Dest;
1048  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1049  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1050  << ", probability: " << SuccProb << " (Trellis)\n");
1051  Result.BB = TrellisSucc;
1052  return Result;
1053 }
1054 
1055 /// When the option allowTailDupPlacement() is on, this method checks if the
1056 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1057 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1058 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1059  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1060  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1061  if (!shouldTailDuplicate(Succ))
1062  return false;
1063 
1064  // For CFG checking.
1066  BB->succ_end());
1067  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1068  // Make sure all unplaced and unfiltered predecessors can be
1069  // tail-duplicated into.
1070  // Skip any blocks that are already placed or not in this loop.
1071  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1072  || BlockToChain[Pred] == &Chain)
1073  continue;
1074  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1075  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1076  // This will result in a trellis after tail duplication, so we don't
1077  // need to copy Succ into this predecessor. In the presence
1078  // of a trellis tail duplication can continue to be profitable.
1079  // For example:
1080  // A A
1081  // |\ |\
1082  // | \ | \
1083  // | C | C+BB
1084  // | / | |
1085  // |/ | |
1086  // BB => BB |
1087  // |\ |\/|
1088  // | \ |/\|
1089  // | D | D
1090  // | / | /
1091  // |/ |/
1092  // Succ Succ
1093  //
1094  // After BB was duplicated into C, the layout looks like the one on the
1095  // right. BB and C now have the same successors. When considering
1096  // whether Succ can be duplicated into all its unplaced predecessors, we
1097  // ignore C.
1098  // We can do this because C already has a profitable fallthrough, namely
1099  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1100  // duplication and for this test.
1101  //
1102  // This allows trellises to be laid out in 2 separate chains
1103  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1104  // because it allows the creation of 2 fallthrough paths with links
1105  // between them, and we correctly identify the best layout for these
1106  // CFGs. We want to extend trellises that the user created in addition
1107  // to trellises created by tail-duplication, so we just look for the
1108  // CFG.
1109  continue;
1110  return false;
1111  }
1112  }
1113  return true;
1114 }
1115 
1116 /// Find chains of triangles where we believe it would be profitable to
1117 /// tail-duplicate them all, but a local analysis would not find them.
1118 /// There are 3 ways this can be profitable:
1119 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1120 /// longer chains)
1121 /// 2) The chains are statically correlated. Branch probabilities have a very
1122 /// U-shaped distribution.
1123 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1124 /// If the branches in a chain are likely to be from the same side of the
1125 /// distribution as their predecessor, but are independent at runtime, this
1126 /// transformation is profitable. (Because the cost of being wrong is a small
1127 /// fixed cost, unlike the standard triangle layout where the cost of being
1128 /// wrong scales with the # of triangles.)
1129 /// 3) The chains are dynamically correlated. If the probability that a previous
1130 /// branch was taken positively influences whether the next branch will be
1131 /// taken
1132 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1133 void MachineBlockPlacement::precomputeTriangleChains() {
1134  struct TriangleChain {
1135  std::vector<MachineBasicBlock *> Edges;
1136 
1137  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1138  : Edges({src, dst}) {}
1139 
1140  void append(MachineBasicBlock *dst) {
1141  assert(getKey()->isSuccessor(dst) &&
1142  "Attempting to append a block that is not a successor.");
1143  Edges.push_back(dst);
1144  }
1145 
1146  unsigned count() const { return Edges.size() - 1; }
1147 
1148  MachineBasicBlock *getKey() const {
1149  return Edges.back();
1150  }
1151  };
1152 
1153  if (TriangleChainCount == 0)
1154  return;
1155 
1156  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1157  // Map from last block to the chain that contains it. This allows us to extend
1158  // chains as we find new triangles.
1160  for (MachineBasicBlock &BB : *F) {
1161  // If BB doesn't have 2 successors, it doesn't start a triangle.
1162  if (BB.succ_size() != 2)
1163  continue;
1164  MachineBasicBlock *PDom = nullptr;
1165  for (MachineBasicBlock *Succ : BB.successors()) {
1166  if (!MPDT->dominates(Succ, &BB))
1167  continue;
1168  PDom = Succ;
1169  break;
1170  }
1171  // If BB doesn't have a post-dominating successor, it doesn't form a
1172  // triangle.
1173  if (PDom == nullptr)
1174  continue;
1175  // If PDom has a hint that it is low probability, skip this triangle.
1176  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1177  continue;
1178  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1179  // we're looking for.
1180  if (!shouldTailDuplicate(PDom))
1181  continue;
1182  bool CanTailDuplicate = true;
1183  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1184  // isn't the kind of triangle we're looking for.
1185  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1186  if (Pred == &BB)
1187  continue;
1188  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1189  CanTailDuplicate = false;
1190  break;
1191  }
1192  }
1193  // If we can't tail-duplicate PDom to its predecessors, then skip this
1194  // triangle.
1195  if (!CanTailDuplicate)
1196  continue;
1197 
1198  // Now we have an interesting triangle. Insert it if it's not part of an
1199  // existing chain.
1200  // Note: This cannot be replaced with a call insert() or emplace() because
1201  // the find key is BB, but the insert/emplace key is PDom.
1202  auto Found = TriangleChainMap.find(&BB);
1203  // If it is, remove the chain from the map, grow it, and put it back in the
1204  // map with the end as the new key.
1205  if (Found != TriangleChainMap.end()) {
1206  TriangleChain Chain = std::move(Found->second);
1207  TriangleChainMap.erase(Found);
1208  Chain.append(PDom);
1209  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1210  } else {
1211  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1212  assert(InsertResult.second && "Block seen twice.");
1213  (void)InsertResult;
1214  }
1215  }
1216 
1217  // Iterating over a DenseMap is safe here, because the only thing in the body
1218  // of the loop is inserting into another DenseMap (ComputedEdges).
1219  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1220  for (auto &ChainPair : TriangleChainMap) {
1221  TriangleChain &Chain = ChainPair.second;
1222  // Benchmarking has shown that due to branch correlation duplicating 2 or
1223  // more triangles is profitable, despite the calculations assuming
1224  // independence.
1225  if (Chain.count() < TriangleChainCount)
1226  continue;
1227  MachineBasicBlock *dst = Chain.Edges.back();
1228  Chain.Edges.pop_back();
1229  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1230  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1231  << getBlockName(dst)
1232  << " as pre-computed based on triangles.\n");
1233 
1234  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1235  assert(InsertResult.second && "Block seen twice.");
1236  (void)InsertResult;
1237 
1238  dst = src;
1239  }
1240  }
1241 }
1242 
1243 // When profile is not present, return the StaticLikelyProb.
1244 // When profile is available, we need to handle the triangle-shape CFG.
1246  const MachineBasicBlock *BB) {
1247  if (!BB->getParent()->getFunction().hasProfileData())
1248  return BranchProbability(StaticLikelyProb, 100);
1249  if (BB->succ_size() == 2) {
1250  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1251  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1252  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1253  /* See case 1 below for the cost analysis. For BB->Succ to
1254  * be taken with smaller cost, the following needs to hold:
1255  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1256  * So the threshold T in the calculation below
1257  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1258  * So T / (1 - T) = 2, Yielding T = 2/3
1259  * Also adding user specified branch bias, we have
1260  * T = (2/3)*(ProfileLikelyProb/50)
1261  * = (2*ProfileLikelyProb)/150)
1262  */
1263  return BranchProbability(2 * ProfileLikelyProb, 150);
1264  }
1265  }
1266  return BranchProbability(ProfileLikelyProb, 100);
1267 }
1268 
1269 /// Checks to see if the layout candidate block \p Succ has a better layout
1270 /// predecessor than \c BB. If yes, returns true.
1271 /// \p SuccProb: The probability adjusted for only remaining blocks.
1272 /// Only used for logging
1273 /// \p RealSuccProb: The un-adjusted probability.
1274 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1275 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1276 /// considered
1277 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1278  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1279  const BlockChain &SuccChain, BranchProbability SuccProb,
1280  BranchProbability RealSuccProb, const BlockChain &Chain,
1281  const BlockFilterSet *BlockFilter) {
1282 
1283  // There isn't a better layout when there are no unscheduled predecessors.
1284  if (SuccChain.UnscheduledPredecessors == 0)
1285  return false;
1286 
1287  // There are two basic scenarios here:
1288  // -------------------------------------
1289  // Case 1: triangular shape CFG (if-then):
1290  // BB
1291  // | \
1292  // | \
1293  // | Pred
1294  // | /
1295  // Succ
1296  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1297  // set Succ as the layout successor of BB. Picking Succ as BB's
1298  // successor breaks the CFG constraints (FIXME: define these constraints).
1299  // With this layout, Pred BB
1300  // is forced to be outlined, so the overall cost will be cost of the
1301  // branch taken from BB to Pred, plus the cost of back taken branch
1302  // from Pred to Succ, as well as the additional cost associated
1303  // with the needed unconditional jump instruction from Pred To Succ.
1304 
1305  // The cost of the topological order layout is the taken branch cost
1306  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1307  // must hold:
1308  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1309  // < freq(BB->Succ) * taken_branch_cost.
1310  // Ignoring unconditional jump cost, we get
1311  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1312  // prob(BB->Succ) > 2 * prob(BB->Pred)
1313  //
1314  // When real profile data is available, we can precisely compute the
1315  // probability threshold that is needed for edge BB->Succ to be considered.
1316  // Without profile data, the heuristic requires the branch bias to be
1317  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1318  // -----------------------------------------------------------------
1319  // Case 2: diamond like CFG (if-then-else):
1320  // S
1321  // / \
1322  // | \
1323  // BB Pred
1324  // \ /
1325  // Succ
1326  // ..
1327  //
1328  // The current block is BB and edge BB->Succ is now being evaluated.
1329  // Note that edge S->BB was previously already selected because
1330  // prob(S->BB) > prob(S->Pred).
1331  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1332  // choose Pred, we will have a topological ordering as shown on the left
1333  // in the picture below. If we choose Succ, we have the solution as shown
1334  // on the right:
1335  //
1336  // topo-order:
1337  //
1338  // S----- ---S
1339  // | | | |
1340  // ---BB | | BB
1341  // | | | |
1342  // | Pred-- | Succ--
1343  // | | | |
1344  // ---Succ ---Pred--
1345  //
1346  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1347  // = freq(S->Pred) + freq(S->BB)
1348  //
1349  // If we have profile data (i.e, branch probabilities can be trusted), the
1350  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1351  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1352  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1353  // means the cost of topological order is greater.
1354  // When profile data is not available, however, we need to be more
1355  // conservative. If the branch prediction is wrong, breaking the topo-order
1356  // will actually yield a layout with large cost. For this reason, we need
1357  // strong biased branch at block S with Prob(S->BB) in order to select
1358  // BB->Succ. This is equivalent to looking the CFG backward with backward
1359  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1360  // profile data).
1361  // --------------------------------------------------------------------------
1362  // Case 3: forked diamond
1363  // S
1364  // / \
1365  // / \
1366  // BB Pred
1367  // | \ / |
1368  // | \ / |
1369  // | X |
1370  // | / \ |
1371  // | / \ |
1372  // S1 S2
1373  //
1374  // The current block is BB and edge BB->S1 is now being evaluated.
1375  // As above S->BB was already selected because
1376  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1377  //
1378  // topo-order:
1379  //
1380  // S-------| ---S
1381  // | | | |
1382  // ---BB | | BB
1383  // | | | |
1384  // | Pred----| | S1----
1385  // | | | |
1386  // --(S1 or S2) ---Pred--
1387  // |
1388  // S2
1389  //
1390  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1391  // + min(freq(Pred->S1), freq(Pred->S2))
1392  // Non-topo-order cost:
1393  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1394  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1395  // is 0. Then the non topo layout is better when
1396  // freq(S->Pred) < freq(BB->S1).
1397  // This is exactly what is checked below.
1398  // Note there are other shapes that apply (Pred may not be a single block,
1399  // but they all fit this general pattern.)
1401 
1402  // Make sure that a hot successor doesn't have a globally more
1403  // important predecessor.
1404  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1405  bool BadCFGConflict = false;
1406 
1407  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1408  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1409  (BlockFilter && !BlockFilter->count(Pred)) ||
1410  BlockToChain[Pred] == &Chain ||
1411  // This check is redundant except for look ahead. This function is
1412  // called for lookahead by isProfitableToTailDup when BB hasn't been
1413  // placed yet.
1414  (Pred == BB))
1415  continue;
1416  // Do backward checking.
1417  // For all cases above, we need a backward checking to filter out edges that
1418  // are not 'strongly' biased.
1419  // BB Pred
1420  // \ /
1421  // Succ
1422  // We select edge BB->Succ if
1423  // freq(BB->Succ) > freq(Succ) * HotProb
1424  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1425  // HotProb
1426  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1427  // Case 1 is covered too, because the first equation reduces to:
1428  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1429  BlockFrequency PredEdgeFreq =
1430  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1431  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1432  BadCFGConflict = true;
1433  break;
1434  }
1435  }
1436 
1437  if (BadCFGConflict) {
1438  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1439  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1440  return true;
1441  }
1442 
1443  return false;
1444 }
1445 
1446 /// Select the best successor for a block.
1447 ///
1448 /// This looks across all successors of a particular block and attempts to
1449 /// select the "best" one to be the layout successor. It only considers direct
1450 /// successors which also pass the block filter. It will attempt to avoid
1451 /// breaking CFG structure, but cave and break such structures in the case of
1452 /// very hot successor edges.
1453 ///
1454 /// \returns The best successor block found, or null if none are viable, along
1455 /// with a boolean indicating if tail duplication is necessary.
1456 MachineBlockPlacement::BlockAndTailDupResult
1457 MachineBlockPlacement::selectBestSuccessor(
1458  const MachineBasicBlock *BB, const BlockChain &Chain,
1459  const BlockFilterSet *BlockFilter) {
1460  const BranchProbability HotProb(StaticLikelyProb, 100);
1461 
1462  BlockAndTailDupResult BestSucc = { nullptr, false };
1463  auto BestProb = BranchProbability::getZero();
1464 
1466  auto AdjustedSumProb =
1467  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1468 
1469  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1470  << "\n");
1471 
1472  // if we already precomputed the best successor for BB, return that if still
1473  // applicable.
1474  auto FoundEdge = ComputedEdges.find(BB);
1475  if (FoundEdge != ComputedEdges.end()) {
1476  MachineBasicBlock *Succ = FoundEdge->second.BB;
1477  ComputedEdges.erase(FoundEdge);
1478  BlockChain *SuccChain = BlockToChain[Succ];
1479  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1480  SuccChain != &Chain && Succ == *SuccChain->begin())
1481  return FoundEdge->second;
1482  }
1483 
1484  // if BB is part of a trellis, Use the trellis to determine the optimal
1485  // fallthrough edges
1486  if (isTrellis(BB, Successors, Chain, BlockFilter))
1487  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1488  BlockFilter);
1489 
1490  // For blocks with CFG violations, we may be able to lay them out anyway with
1491  // tail-duplication. We keep this vector so we can perform the probability
1492  // calculations the minimum number of times.
1494  DupCandidates;
1495  for (MachineBasicBlock *Succ : Successors) {
1496  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1497  BranchProbability SuccProb =
1498  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1499 
1500  BlockChain &SuccChain = *BlockToChain[Succ];
1501  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1502  // predecessor that yields lower global cost.
1503  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1504  Chain, BlockFilter)) {
1505  // If tail duplication would make Succ profitable, place it.
1506  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1507  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1508  continue;
1509  }
1510 
1511  LLVM_DEBUG(
1512  dbgs() << " Candidate: " << getBlockName(Succ)
1513  << ", probability: " << SuccProb
1514  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1515  << "\n");
1516 
1517  if (BestSucc.BB && BestProb >= SuccProb) {
1518  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1519  continue;
1520  }
1521 
1522  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1523  BestSucc.BB = Succ;
1524  BestProb = SuccProb;
1525  }
1526  // Handle the tail duplication candidates in order of decreasing probability.
1527  // Stop at the first one that is profitable. Also stop if they are less
1528  // profitable than BestSucc. Position is important because we preserve it and
1529  // prefer first best match. Here we aren't comparing in order, so we capture
1530  // the position instead.
1531  if (DupCandidates.size() != 0) {
1532  auto cmp =
1533  [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
1534  const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
1535  return std::get<0>(a) > std::get<0>(b);
1536  };
1537  std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
1538  }
1539  for(auto &Tup : DupCandidates) {
1540  BranchProbability DupProb;
1541  MachineBasicBlock *Succ;
1542  std::tie(DupProb, Succ) = Tup;
1543  if (DupProb < BestProb)
1544  break;
1545  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1546  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1547  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1548  << ", probability: " << DupProb
1549  << " (Tail Duplicate)\n");
1550  BestSucc.BB = Succ;
1551  BestSucc.ShouldTailDup = true;
1552  break;
1553  }
1554  }
1555 
1556  if (BestSucc.BB)
1557  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1558 
1559  return BestSucc;
1560 }
1561 
1562 /// Select the best block from a worklist.
1563 ///
1564 /// This looks through the provided worklist as a list of candidate basic
1565 /// blocks and select the most profitable one to place. The definition of
1566 /// profitable only really makes sense in the context of a loop. This returns
1567 /// the most frequently visited block in the worklist, which in the case of
1568 /// a loop, is the one most desirable to be physically close to the rest of the
1569 /// loop body in order to improve i-cache behavior.
1570 ///
1571 /// \returns The best block found, or null if none are viable.
1572 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1573  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1574  // Once we need to walk the worklist looking for a candidate, cleanup the
1575  // worklist of already placed entries.
1576  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1577  // some code complexity) into the loop below.
1578  WorkList.erase(llvm::remove_if(WorkList,
1579  [&](MachineBasicBlock *BB) {
1580  return BlockToChain.lookup(BB) == &Chain;
1581  }),
1582  WorkList.end());
1583 
1584  if (WorkList.empty())
1585  return nullptr;
1586 
1587  bool IsEHPad = WorkList[0]->isEHPad();
1588 
1589  MachineBasicBlock *BestBlock = nullptr;
1590  BlockFrequency BestFreq;
1591  for (MachineBasicBlock *MBB : WorkList) {
1592  assert(MBB->isEHPad() == IsEHPad &&
1593  "EHPad mismatch between block and work list.");
1594 
1595  BlockChain &SuccChain = *BlockToChain[MBB];
1596  if (&SuccChain == &Chain)
1597  continue;
1598 
1599  assert(SuccChain.UnscheduledPredecessors == 0 &&
1600  "Found CFG-violating block");
1601 
1602  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1603  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1604  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1605 
1606  // For ehpad, we layout the least probable first as to avoid jumping back
1607  // from least probable landingpads to more probable ones.
1608  //
1609  // FIXME: Using probability is probably (!) not the best way to achieve
1610  // this. We should probably have a more principled approach to layout
1611  // cleanup code.
1612  //
1613  // The goal is to get:
1614  //
1615  // +--------------------------+
1616  // | V
1617  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1618  //
1619  // Rather than:
1620  //
1621  // +-------------------------------------+
1622  // V |
1623  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1624  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1625  continue;
1626 
1627  BestBlock = MBB;
1628  BestFreq = CandidateFreq;
1629  }
1630 
1631  return BestBlock;
1632 }
1633 
1634 /// Retrieve the first unplaced basic block.
1635 ///
1636 /// This routine is called when we are unable to use the CFG to walk through
1637 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1638 /// We walk through the function's blocks in order, starting from the
1639 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1640 /// re-scanning the entire sequence on repeated calls to this routine.
1641 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1642  const BlockChain &PlacedChain,
1643  MachineFunction::iterator &PrevUnplacedBlockIt,
1644  const BlockFilterSet *BlockFilter) {
1645  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1646  ++I) {
1647  if (BlockFilter && !BlockFilter->count(&*I))
1648  continue;
1649  if (BlockToChain[&*I] != &PlacedChain) {
1650  PrevUnplacedBlockIt = I;
1651  // Now select the head of the chain to which the unplaced block belongs
1652  // as the block to place. This will force the entire chain to be placed,
1653  // and satisfies the requirements of merging chains.
1654  return *BlockToChain[&*I]->begin();
1655  }
1656  }
1657  return nullptr;
1658 }
1659 
1660 void MachineBlockPlacement::fillWorkLists(
1661  const MachineBasicBlock *MBB,
1662  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1663  const BlockFilterSet *BlockFilter = nullptr) {
1664  BlockChain &Chain = *BlockToChain[MBB];
1665  if (!UpdatedPreds.insert(&Chain).second)
1666  return;
1667 
1668  assert(
1669  Chain.UnscheduledPredecessors == 0 &&
1670  "Attempting to place block with unscheduled predecessors in worklist.");
1671  for (MachineBasicBlock *ChainBB : Chain) {
1672  assert(BlockToChain[ChainBB] == &Chain &&
1673  "Block in chain doesn't match BlockToChain map.");
1674  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1675  if (BlockFilter && !BlockFilter->count(Pred))
1676  continue;
1677  if (BlockToChain[Pred] == &Chain)
1678  continue;
1679  ++Chain.UnscheduledPredecessors;
1680  }
1681  }
1682 
1683  if (Chain.UnscheduledPredecessors != 0)
1684  return;
1685 
1686  MachineBasicBlock *BB = *Chain.begin();
1687  if (BB->isEHPad())
1688  EHPadWorkList.push_back(BB);
1689  else
1690  BlockWorkList.push_back(BB);
1691 }
1692 
1693 void MachineBlockPlacement::buildChain(
1694  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1695  BlockFilterSet *BlockFilter) {
1696  assert(HeadBB && "BB must not be null.\n");
1697  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1698  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1699 
1700  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1701  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1702  MachineBasicBlock *BB = *std::prev(Chain.end());
1703  while (true) {
1704  assert(BB && "null block found at end of chain in loop.");
1705  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1706  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1707 
1708 
1709  // Look for the best viable successor if there is one to place immediately
1710  // after this block.
1711  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1712  MachineBasicBlock* BestSucc = Result.BB;
1713  bool ShouldTailDup = Result.ShouldTailDup;
1714  if (allowTailDupPlacement())
1715  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1716 
1717  // If an immediate successor isn't available, look for the best viable
1718  // block among those we've identified as not violating the loop's CFG at
1719  // this point. This won't be a fallthrough, but it will increase locality.
1720  if (!BestSucc)
1721  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1722  if (!BestSucc)
1723  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1724 
1725  if (!BestSucc) {
1726  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1727  if (!BestSucc)
1728  break;
1729 
1730  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1731  "layout successor until the CFG reduces\n");
1732  }
1733 
1734  // Placement may have changed tail duplication opportunities.
1735  // Check for that now.
1736  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1737  // If the chosen successor was duplicated into all its predecessors,
1738  // don't bother laying it out, just go round the loop again with BB as
1739  // the chain end.
1740  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1741  BlockFilter, PrevUnplacedBlockIt))
1742  continue;
1743  }
1744 
1745  // Place this block, updating the datastructures to reflect its placement.
1746  BlockChain &SuccChain = *BlockToChain[BestSucc];
1747  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1748  // we selected a successor that didn't fit naturally into the CFG.
1749  SuccChain.UnscheduledPredecessors = 0;
1750  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1751  << getBlockName(BestSucc) << "\n");
1752  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1753  Chain.merge(BestSucc, &SuccChain);
1754  BB = *std::prev(Chain.end());
1755  }
1756 
1757  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1758  << getBlockName(*Chain.begin()) << "\n");
1759 }
1760 
1761 // If bottom of block BB has only one successor OldTop, in most cases it is
1762 // profitable to move it before OldTop, except the following case:
1763 //
1764 // -->OldTop<-
1765 // | . |
1766 // | . |
1767 // | . |
1768 // ---Pred |
1769 // | |
1770 // BB-----
1771 //
1772 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1773 // layout the other successor below it, so it can't reduce taken branch.
1774 // In this case we keep its original layout.
1775 bool
1776 MachineBlockPlacement::canMoveBottomBlockToTop(
1777  const MachineBasicBlock *BottomBlock,
1778  const MachineBasicBlock *OldTop) {
1779  if (BottomBlock->pred_size() != 1)
1780  return true;
1781  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1782  if (Pred->succ_size() != 2)
1783  return true;
1784 
1785  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1786  if (OtherBB == BottomBlock)
1787  OtherBB = *Pred->succ_rbegin();
1788  if (OtherBB == OldTop)
1789  return false;
1790 
1791  return true;
1792 }
1793 
1794 /// Find the best loop top block for layout.
1795 ///
1796 /// Look for a block which is strictly better than the loop header for laying
1797 /// out at the top of the loop. This looks for one and only one pattern:
1798 /// a latch block with no conditional exit. This block will cause a conditional
1799 /// jump around it or will be the bottom of the loop if we lay it out in place,
1800 /// but if it it doesn't end up at the bottom of the loop for any reason,
1801 /// rotation alone won't fix it. Because such a block will always result in an
1802 /// unconditional jump (for the backedge) rotating it in front of the loop
1803 /// header is always profitable.
1805 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
1806  const BlockFilterSet &LoopBlockSet) {
1807  // Placing the latch block before the header may introduce an extra branch
1808  // that skips this block the first time the loop is executed, which we want
1809  // to avoid when optimising for size.
1810  // FIXME: in theory there is a case that does not introduce a new branch,
1811  // i.e. when the layout predecessor does not fallthrough to the loop header.
1812  // In practice this never happens though: there always seems to be a preheader
1813  // that can fallthrough and that is also placed before the header.
1814  if (F->getFunction().optForSize())
1815  return L.getHeader();
1816 
1817  // Check that the header hasn't been fused with a preheader block due to
1818  // crazy branches. If it has, we need to start with the header at the top to
1819  // prevent pulling the preheader into the loop body.
1820  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1821  if (!LoopBlockSet.count(*HeaderChain.begin()))
1822  return L.getHeader();
1823 
1824  LLVM_DEBUG(dbgs() << "Finding best loop top for: "
1825  << getBlockName(L.getHeader()) << "\n");
1826 
1827  BlockFrequency BestPredFreq;
1828  MachineBasicBlock *BestPred = nullptr;
1829  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
1830  if (!LoopBlockSet.count(Pred))
1831  continue;
1832  LLVM_DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has "
1833  << Pred->succ_size() << " successors, ";
1834  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1835  if (Pred->succ_size() > 1)
1836  continue;
1837 
1838  if (!canMoveBottomBlockToTop(Pred, L.getHeader()))
1839  continue;
1840 
1841  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
1842  if (!BestPred || PredFreq > BestPredFreq ||
1843  (!(PredFreq < BestPredFreq) &&
1844  Pred->isLayoutSuccessor(L.getHeader()))) {
1845  BestPred = Pred;
1846  BestPredFreq = PredFreq;
1847  }
1848  }
1849 
1850  // If no direct predecessor is fine, just use the loop header.
1851  if (!BestPred) {
1852  LLVM_DEBUG(dbgs() << " final top unchanged\n");
1853  return L.getHeader();
1854  }
1855 
1856  // Walk backwards through any straight line of predecessors.
1857  while (BestPred->pred_size() == 1 &&
1858  (*BestPred->pred_begin())->succ_size() == 1 &&
1859  *BestPred->pred_begin() != L.getHeader())
1860  BestPred = *BestPred->pred_begin();
1861 
1862  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
1863  return BestPred;
1864 }
1865 
1866 /// Find the best loop exiting block for layout.
1867 ///
1868 /// This routine implements the logic to analyze the loop looking for the best
1869 /// block to layout at the top of the loop. Typically this is done to maximize
1870 /// fallthrough opportunities.
1872 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
1873  const BlockFilterSet &LoopBlockSet) {
1874  // We don't want to layout the loop linearly in all cases. If the loop header
1875  // is just a normal basic block in the loop, we want to look for what block
1876  // within the loop is the best one to layout at the top. However, if the loop
1877  // header has be pre-merged into a chain due to predecessors not having
1878  // analyzable branches, *and* the predecessor it is merged with is *not* part
1879  // of the loop, rotating the header into the middle of the loop will create
1880  // a non-contiguous range of blocks which is Very Bad. So start with the
1881  // header and only rotate if safe.
1882  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
1883  if (!LoopBlockSet.count(*HeaderChain.begin()))
1884  return nullptr;
1885 
1886  BlockFrequency BestExitEdgeFreq;
1887  unsigned BestExitLoopDepth = 0;
1888  MachineBasicBlock *ExitingBB = nullptr;
1889  // If there are exits to outer loops, loop rotation can severely limit
1890  // fallthrough opportunities unless it selects such an exit. Keep a set of
1891  // blocks where rotating to exit with that block will reach an outer loop.
1892  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
1893 
1894  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
1895  << getBlockName(L.getHeader()) << "\n");
1896  for (MachineBasicBlock *MBB : L.getBlocks()) {
1897  BlockChain &Chain = *BlockToChain[MBB];
1898  // Ensure that this block is at the end of a chain; otherwise it could be
1899  // mid-way through an inner loop or a successor of an unanalyzable branch.
1900  if (MBB != *std::prev(Chain.end()))
1901  continue;
1902 
1903  // Now walk the successors. We need to establish whether this has a viable
1904  // exiting successor and whether it has a viable non-exiting successor.
1905  // We store the old exiting state and restore it if a viable looping
1906  // successor isn't found.
1907  MachineBasicBlock *OldExitingBB = ExitingBB;
1908  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
1909  bool HasLoopingSucc = false;
1910  for (MachineBasicBlock *Succ : MBB->successors()) {
1911  if (Succ->isEHPad())
1912  continue;
1913  if (Succ == MBB)
1914  continue;
1915  BlockChain &SuccChain = *BlockToChain[Succ];
1916  // Don't split chains, either this chain or the successor's chain.
1917  if (&Chain == &SuccChain) {
1918  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1919  << getBlockName(Succ) << " (chain conflict)\n");
1920  continue;
1921  }
1922 
1923  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
1924  if (LoopBlockSet.count(Succ)) {
1925  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
1926  << getBlockName(Succ) << " (" << SuccProb << ")\n");
1927  HasLoopingSucc = true;
1928  continue;
1929  }
1930 
1931  unsigned SuccLoopDepth = 0;
1932  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
1933  SuccLoopDepth = ExitLoop->getLoopDepth();
1934  if (ExitLoop->contains(&L))
1935  BlocksExitingToOuterLoop.insert(MBB);
1936  }
1937 
1938  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
1939  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
1940  << getBlockName(Succ) << " [L:" << SuccLoopDepth
1941  << "] (";
1942  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
1943  // Note that we bias this toward an existing layout successor to retain
1944  // incoming order in the absence of better information. The exit must have
1945  // a frequency higher than the current exit before we consider breaking
1946  // the layout.
1947  BranchProbability Bias(100 - ExitBlockBias, 100);
1948  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
1949  ExitEdgeFreq > BestExitEdgeFreq ||
1950  (MBB->isLayoutSuccessor(Succ) &&
1951  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
1952  BestExitEdgeFreq = ExitEdgeFreq;
1953  ExitingBB = MBB;
1954  }
1955  }
1956 
1957  if (!HasLoopingSucc) {
1958  // Restore the old exiting state, no viable looping successor was found.
1959  ExitingBB = OldExitingBB;
1960  BestExitEdgeFreq = OldBestExitEdgeFreq;
1961  }
1962  }
1963  // Without a candidate exiting block or with only a single block in the
1964  // loop, just use the loop header to layout the loop.
1965  if (!ExitingBB) {
1966  LLVM_DEBUG(
1967  dbgs() << " No other candidate exit blocks, using loop header\n");
1968  return nullptr;
1969  }
1970  if (L.getNumBlocks() == 1) {
1971  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
1972  return nullptr;
1973  }
1974 
1975  // Also, if we have exit blocks which lead to outer loops but didn't select
1976  // one of them as the exiting block we are rotating toward, disable loop
1977  // rotation altogether.
1978  if (!BlocksExitingToOuterLoop.empty() &&
1979  !BlocksExitingToOuterLoop.count(ExitingBB))
1980  return nullptr;
1981 
1982  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
1983  << "\n");
1984  return ExitingBB;
1985 }
1986 
1987 /// Attempt to rotate an exiting block to the bottom of the loop.
1988 ///
1989 /// Once we have built a chain, try to rotate it to line up the hot exit block
1990 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
1991 /// branches. For example, if the loop has fallthrough into its header and out
1992 /// of its bottom already, don't rotate it.
1993 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
1994  const MachineBasicBlock *ExitingBB,
1995  const BlockFilterSet &LoopBlockSet) {
1996  if (!ExitingBB)
1997  return;
1998 
1999  MachineBasicBlock *Top = *LoopChain.begin();
2000  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2001 
2002  // If ExitingBB is already the last one in a chain then nothing to do.
2003  if (Bottom == ExitingBB)
2004  return;
2005 
2006  bool ViableTopFallthrough = false;
2007  for (MachineBasicBlock *Pred : Top->predecessors()) {
2008  BlockChain *PredChain = BlockToChain[Pred];
2009  if (!LoopBlockSet.count(Pred) &&
2010  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2011  ViableTopFallthrough = true;
2012  break;
2013  }
2014  }
2015 
2016  // If the header has viable fallthrough, check whether the current loop
2017  // bottom is a viable exiting block. If so, bail out as rotating will
2018  // introduce an unnecessary branch.
2019  if (ViableTopFallthrough) {
2020  for (MachineBasicBlock *Succ : Bottom->successors()) {
2021  BlockChain *SuccChain = BlockToChain[Succ];
2022  if (!LoopBlockSet.count(Succ) &&
2023  (!SuccChain || Succ == *SuccChain->begin()))
2024  return;
2025  }
2026  }
2027 
2028  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2029  if (ExitIt == LoopChain.end())
2030  return;
2031 
2032  // Rotating a loop exit to the bottom when there is a fallthrough to top
2033  // trades the entry fallthrough for an exit fallthrough.
2034  // If there is no bottom->top edge, but the chosen exit block does have
2035  // a fallthrough, we break that fallthrough for nothing in return.
2036 
2037  // Let's consider an example. We have a built chain of basic blocks
2038  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2039  // By doing a rotation we get
2040  // Bk+1, ..., Bn, B1, ..., Bk
2041  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2042  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2043  // It might be compensated by fallthrough Bn -> B1.
2044  // So we have a condition to avoid creation of extra branch by loop rotation.
2045  // All below must be true to avoid loop rotation:
2046  // If there is a fallthrough to top (B1)
2047  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2048  // There is no fallthrough from bottom (Bn) to top (B1).
2049  // Please note that there is no exit fallthrough from Bn because we checked it
2050  // above.
2051  if (ViableTopFallthrough) {
2052  assert(std::next(ExitIt) != LoopChain.end() &&
2053  "Exit should not be last BB");
2054  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2055  if (ExitingBB->isSuccessor(NextBlockInChain))
2056  if (!Bottom->isSuccessor(Top))
2057  return;
2058  }
2059 
2060  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2061  << " at bottom\n");
2062  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2063 }
2064 
2065 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2066 ///
2067 /// With profile data, we can determine the cost in terms of missed fall through
2068 /// opportunities when rotating a loop chain and select the best rotation.
2069 /// Basically, there are three kinds of cost to consider for each rotation:
2070 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2071 /// the loop to the loop header.
2072 /// 2. The possibly missed fall through edges (if they exist) from the loop
2073 /// exits to BB out of the loop.
2074 /// 3. The missed fall through edge (if it exists) from the last BB to the
2075 /// first BB in the loop chain.
2076 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2077 /// We select the best rotation with the smallest cost.
2078 void MachineBlockPlacement::rotateLoopWithProfile(
2079  BlockChain &LoopChain, const MachineLoop &L,
2080  const BlockFilterSet &LoopBlockSet) {
2081  auto HeaderBB = L.getHeader();
2082  auto HeaderIter = llvm::find(LoopChain, HeaderBB);
2083  auto RotationPos = LoopChain.end();
2084 
2085  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2086 
2087  // A utility lambda that scales up a block frequency by dividing it by a
2088  // branch probability which is the reciprocal of the scale.
2089  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2090  unsigned Scale) -> BlockFrequency {
2091  if (Scale == 0)
2092  return 0;
2093  // Use operator / between BlockFrequency and BranchProbability to implement
2094  // saturating multiplication.
2095  return Freq / BranchProbability(1, Scale);
2096  };
2097 
2098  // Compute the cost of the missed fall-through edge to the loop header if the
2099  // chain head is not the loop header. As we only consider natural loops with
2100  // single header, this computation can be done only once.
2101  BlockFrequency HeaderFallThroughCost(0);
2102  for (auto *Pred : HeaderBB->predecessors()) {
2103  BlockChain *PredChain = BlockToChain[Pred];
2104  if (!LoopBlockSet.count(Pred) &&
2105  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2106  auto EdgeFreq =
2107  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
2108  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2109  // If the predecessor has only an unconditional jump to the header, we
2110  // need to consider the cost of this jump.
2111  if (Pred->succ_size() == 1)
2112  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2113  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2114  }
2115  }
2116 
2117  // Here we collect all exit blocks in the loop, and for each exit we find out
2118  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2119  // as the sum of frequencies of exit edges we collect here, excluding the exit
2120  // edge from the tail of the loop chain.
2122  for (auto BB : LoopChain) {
2123  auto LargestExitEdgeProb = BranchProbability::getZero();
2124  for (auto *Succ : BB->successors()) {
2125  BlockChain *SuccChain = BlockToChain[Succ];
2126  if (!LoopBlockSet.count(Succ) &&
2127  (!SuccChain || Succ == *SuccChain->begin())) {
2128  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2129  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2130  }
2131  }
2132  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2133  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2134  ExitsWithFreq.emplace_back(BB, ExitFreq);
2135  }
2136  }
2137 
2138  // In this loop we iterate every block in the loop chain and calculate the
2139  // cost assuming the block is the head of the loop chain. When the loop ends,
2140  // we should have found the best candidate as the loop chain's head.
2141  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2142  EndIter = LoopChain.end();
2143  Iter != EndIter; Iter++, TailIter++) {
2144  // TailIter is used to track the tail of the loop chain if the block we are
2145  // checking (pointed by Iter) is the head of the chain.
2146  if (TailIter == LoopChain.end())
2147  TailIter = LoopChain.begin();
2148 
2149  auto TailBB = *TailIter;
2150 
2151  // Calculate the cost by putting this BB to the top.
2152  BlockFrequency Cost = 0;
2153 
2154  // If the current BB is the loop header, we need to take into account the
2155  // cost of the missed fall through edge from outside of the loop to the
2156  // header.
2157  if (Iter != HeaderIter)
2158  Cost += HeaderFallThroughCost;
2159 
2160  // Collect the loop exit cost by summing up frequencies of all exit edges
2161  // except the one from the chain tail.
2162  for (auto &ExitWithFreq : ExitsWithFreq)
2163  if (TailBB != ExitWithFreq.first)
2164  Cost += ExitWithFreq.second;
2165 
2166  // The cost of breaking the once fall-through edge from the tail to the top
2167  // of the loop chain. Here we need to consider three cases:
2168  // 1. If the tail node has only one successor, then we will get an
2169  // additional jmp instruction. So the cost here is (MisfetchCost +
2170  // JumpInstCost) * tail node frequency.
2171  // 2. If the tail node has two successors, then we may still get an
2172  // additional jmp instruction if the layout successor after the loop
2173  // chain is not its CFG successor. Note that the more frequently executed
2174  // jmp instruction will be put ahead of the other one. Assume the
2175  // frequency of those two branches are x and y, where x is the frequency
2176  // of the edge to the chain head, then the cost will be
2177  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2178  // 3. If the tail node has more than two successors (this rarely happens),
2179  // we won't consider any additional cost.
2180  if (TailBB->isSuccessor(*Iter)) {
2181  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2182  if (TailBB->succ_size() == 1)
2183  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2185  else if (TailBB->succ_size() == 2) {
2186  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2187  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2188  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2189  ? TailBBFreq * TailToHeadProb.getCompl()
2190  : TailToHeadFreq;
2191  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2192  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2193  }
2194  }
2195 
2196  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2197  << getBlockName(*Iter)
2198  << " to the top: " << Cost.getFrequency() << "\n");
2199 
2200  if (Cost < SmallestRotationCost) {
2201  SmallestRotationCost = Cost;
2202  RotationPos = Iter;
2203  }
2204  }
2205 
2206  if (RotationPos != LoopChain.end()) {
2207  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2208  << " to the top\n");
2209  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2210  }
2211 }
2212 
2213 /// Collect blocks in the given loop that are to be placed.
2214 ///
2215 /// When profile data is available, exclude cold blocks from the returned set;
2216 /// otherwise, collect all blocks in the loop.
2218 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2219  BlockFilterSet LoopBlockSet;
2220 
2221  // Filter cold blocks off from LoopBlockSet when profile data is available.
2222  // Collect the sum of frequencies of incoming edges to the loop header from
2223  // outside. If we treat the loop as a super block, this is the frequency of
2224  // the loop. Then for each block in the loop, we calculate the ratio between
2225  // its frequency and the frequency of the loop block. When it is too small,
2226  // don't add it to the loop chain. If there are outer loops, then this block
2227  // will be merged into the first outer loop chain for which this block is not
2228  // cold anymore. This needs precise profile data and we only do this when
2229  // profile data is available.
2231  BlockFrequency LoopFreq(0);
2232  for (auto LoopPred : L.getHeader()->predecessors())
2233  if (!L.contains(LoopPred))
2234  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2235  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2236 
2237  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2238  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2239  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2240  continue;
2241  LoopBlockSet.insert(LoopBB);
2242  }
2243  } else
2244  LoopBlockSet.insert(L.block_begin(), L.block_end());
2245 
2246  return LoopBlockSet;
2247 }
2248 
2249 /// Forms basic block chains from the natural loop structures.
2250 ///
2251 /// These chains are designed to preserve the existing *structure* of the code
2252 /// as much as possible. We can then stitch the chains together in a way which
2253 /// both preserves the topological structure and minimizes taken conditional
2254 /// branches.
2255 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2256  // First recurse through any nested loops, building chains for those inner
2257  // loops.
2258  for (const MachineLoop *InnerLoop : L)
2259  buildLoopChains(*InnerLoop);
2260 
2261  assert(BlockWorkList.empty() &&
2262  "BlockWorkList not empty when starting to build loop chains.");
2263  assert(EHPadWorkList.empty() &&
2264  "EHPadWorkList not empty when starting to build loop chains.");
2265  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2266 
2267  // Check if we have profile data for this function. If yes, we will rotate
2268  // this loop by modeling costs more precisely which requires the profile data
2269  // for better layout.
2270  bool RotateLoopWithProfile =
2273 
2274  // First check to see if there is an obviously preferable top block for the
2275  // loop. This will default to the header, but may end up as one of the
2276  // predecessors to the header if there is one which will result in strictly
2277  // fewer branches in the loop body.
2278  // When we use profile data to rotate the loop, this is unnecessary.
2279  MachineBasicBlock *LoopTop =
2280  RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
2281 
2282  // If we selected just the header for the loop top, look for a potentially
2283  // profitable exit block in the event that rotating the loop can eliminate
2284  // branches by placing an exit edge at the bottom.
2285  //
2286  // Loops are processed innermost to uttermost, make sure we clear
2287  // PreferredLoopExit before processing a new loop.
2288  PreferredLoopExit = nullptr;
2289  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2290  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet);
2291 
2292  BlockChain &LoopChain = *BlockToChain[LoopTop];
2293 
2294  // FIXME: This is a really lame way of walking the chains in the loop: we
2295  // walk the blocks, and use a set to prevent visiting a particular chain
2296  // twice.
2297  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2298  assert(LoopChain.UnscheduledPredecessors == 0 &&
2299  "LoopChain should not have unscheduled predecessors.");
2300  UpdatedPreds.insert(&LoopChain);
2301 
2302  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2303  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2304 
2305  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2306 
2307  if (RotateLoopWithProfile)
2308  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2309  else
2310  rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
2311 
2312  LLVM_DEBUG({
2313  // Crash at the end so we get all of the debugging output first.
2314  bool BadLoop = false;
2315  if (LoopChain.UnscheduledPredecessors) {
2316  BadLoop = true;
2317  dbgs() << "Loop chain contains a block without its preds placed!\n"
2318  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2319  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2320  }
2321  for (MachineBasicBlock *ChainBB : LoopChain) {
2322  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2323  if (!LoopBlockSet.remove(ChainBB)) {
2324  // We don't mark the loop as bad here because there are real situations
2325  // where this can occur. For example, with an unanalyzable fallthrough
2326  // from a loop block to a non-loop block or vice versa.
2327  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2328  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2329  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2330  << " Bad block: " << getBlockName(ChainBB) << "\n";
2331  }
2332  }
2333 
2334  if (!LoopBlockSet.empty()) {
2335  BadLoop = true;
2336  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2337  dbgs() << "Loop contains blocks never placed into a chain!\n"
2338  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2339  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2340  << " Bad block: " << getBlockName(LoopBB) << "\n";
2341  }
2342  assert(!BadLoop && "Detected problems with the placement of this loop.");
2343  });
2344 
2345  BlockWorkList.clear();
2346  EHPadWorkList.clear();
2347 }
2348 
2349 void MachineBlockPlacement::buildCFGChains() {
2350  // Ensure that every BB in the function has an associated chain to simplify
2351  // the assumptions of the remaining algorithm.
2352  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2353  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2354  ++FI) {
2355  MachineBasicBlock *BB = &*FI;
2356  BlockChain *Chain =
2357  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2358  // Also, merge any blocks which we cannot reason about and must preserve
2359  // the exact fallthrough behavior for.
2360  while (true) {
2361  Cond.clear();
2362  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2363  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2364  break;
2365 
2366  MachineFunction::iterator NextFI = std::next(FI);
2367  MachineBasicBlock *NextBB = &*NextFI;
2368  // Ensure that the layout successor is a viable block, as we know that
2369  // fallthrough is a possibility.
2370  assert(NextFI != FE && "Can't fallthrough past the last block.");
2371  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2372  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2373  << "\n");
2374  Chain->merge(NextBB, nullptr);
2375 #ifndef NDEBUG
2376  BlocksWithUnanalyzableExits.insert(&*BB);
2377 #endif
2378  FI = NextFI;
2379  BB = NextBB;
2380  }
2381  }
2382 
2383  // Build any loop-based chains.
2384  PreferredLoopExit = nullptr;
2385  for (MachineLoop *L : *MLI)
2386  buildLoopChains(*L);
2387 
2388  assert(BlockWorkList.empty() &&
2389  "BlockWorkList should be empty before building final chain.");
2390  assert(EHPadWorkList.empty() &&
2391  "EHPadWorkList should be empty before building final chain.");
2392 
2393  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2394  for (MachineBasicBlock &MBB : *F)
2395  fillWorkLists(&MBB, UpdatedPreds);
2396 
2397  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2398  buildChain(&F->front(), FunctionChain);
2399 
2400 #ifndef NDEBUG
2401  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2402 #endif
2403  LLVM_DEBUG({
2404  // Crash at the end so we get all of the debugging output first.
2405  bool BadFunc = false;
2406  FunctionBlockSetType FunctionBlockSet;
2407  for (MachineBasicBlock &MBB : *F)
2408  FunctionBlockSet.insert(&MBB);
2409 
2410  for (MachineBasicBlock *ChainBB : FunctionChain)
2411  if (!FunctionBlockSet.erase(ChainBB)) {
2412  BadFunc = true;
2413  dbgs() << "Function chain contains a block not in the function!\n"
2414  << " Bad block: " << getBlockName(ChainBB) << "\n";
2415  }
2416 
2417  if (!FunctionBlockSet.empty()) {
2418  BadFunc = true;
2419  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2420  dbgs() << "Function contains blocks never placed into a chain!\n"
2421  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2422  }
2423  assert(!BadFunc && "Detected problems with the block placement.");
2424  });
2425 
2426  // Splice the blocks into place.
2427  MachineFunction::iterator InsertPos = F->begin();
2428  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2429  for (MachineBasicBlock *ChainBB : FunctionChain) {
2430  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2431  : " ... ")
2432  << getBlockName(ChainBB) << "\n");
2433  if (InsertPos != MachineFunction::iterator(ChainBB))
2434  F->splice(InsertPos, ChainBB);
2435  else
2436  ++InsertPos;
2437 
2438  // Update the terminator of the previous block.
2439  if (ChainBB == *FunctionChain.begin())
2440  continue;
2441  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2442 
2443  // FIXME: It would be awesome of updateTerminator would just return rather
2444  // than assert when the branch cannot be analyzed in order to remove this
2445  // boiler plate.
2446  Cond.clear();
2447  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2448 
2449 #ifndef NDEBUG
2450  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2451  // Given the exact block placement we chose, we may actually not _need_ to
2452  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2453  // do that at this point is a bug.
2454  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2455  !PrevBB->canFallThrough()) &&
2456  "Unexpected block with un-analyzable fallthrough!");
2457  Cond.clear();
2458  TBB = FBB = nullptr;
2459  }
2460 #endif
2461 
2462  // The "PrevBB" is not yet updated to reflect current code layout, so,
2463  // o. it may fall-through to a block without explicit "goto" instruction
2464  // before layout, and no longer fall-through it after layout; or
2465  // o. just opposite.
2466  //
2467  // analyzeBranch() may return erroneous value for FBB when these two
2468  // situations take place. For the first scenario FBB is mistakenly set NULL;
2469  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2470  // mistakenly pointing to "*BI".
2471  // Thus, if the future change needs to use FBB before the layout is set, it
2472  // has to correct FBB first by using the code similar to the following:
2473  //
2474  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2475  // PrevBB->updateTerminator();
2476  // Cond.clear();
2477  // TBB = FBB = nullptr;
2478  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2479  // // FIXME: This should never take place.
2480  // TBB = FBB = nullptr;
2481  // }
2482  // }
2483  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2484  PrevBB->updateTerminator();
2485  }
2486 
2487  // Fixup the last block.
2488  Cond.clear();
2489  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2490  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2491  F->back().updateTerminator();
2492 
2493  BlockWorkList.clear();
2494  EHPadWorkList.clear();
2495 }
2496 
2497 void MachineBlockPlacement::optimizeBranches() {
2498  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2499  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2500 
2501  // Now that all the basic blocks in the chain have the proper layout,
2502  // make a final call to AnalyzeBranch with AllowModify set.
2503  // Indeed, the target may be able to optimize the branches in a way we
2504  // cannot because all branches may not be analyzable.
2505  // E.g., the target may be able to remove an unconditional branch to
2506  // a fallthrough when it occurs after predicated terminators.
2507  for (MachineBasicBlock *ChainBB : FunctionChain) {
2508  Cond.clear();
2509  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2510  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2511  // If PrevBB has a two-way branch, try to re-order the branches
2512  // such that we branch to the successor with higher probability first.
2513  if (TBB && !Cond.empty() && FBB &&
2514  MBPI->getEdgeProbability(ChainBB, FBB) >
2515  MBPI->getEdgeProbability(ChainBB, TBB) &&
2516  !TII->reverseBranchCondition(Cond)) {
2517  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2518  << getBlockName(ChainBB) << "\n");
2519  LLVM_DEBUG(dbgs() << " Edge probability: "
2520  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2521  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2522  DebugLoc dl; // FIXME: this is nowhere
2523  TII->removeBranch(*ChainBB);
2524  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2525  ChainBB->updateTerminator();
2526  }
2527  }
2528  }
2529 }
2530 
2531 void MachineBlockPlacement::alignBlocks() {
2532  // Walk through the backedges of the function now that we have fully laid out
2533  // the basic blocks and align the destination of each backedge. We don't rely
2534  // exclusively on the loop info here so that we can align backedges in
2535  // unnatural CFGs and backedges that were introduced purely because of the
2536  // loop rotations done during this layout pass.
2537  if (F->getFunction().optForMinSize() ||
2538  (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
2539  return;
2540  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2541  if (FunctionChain.begin() == FunctionChain.end())
2542  return; // Empty chain.
2543 
2544  const BranchProbability ColdProb(1, 5); // 20%
2545  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2546  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2547  for (MachineBasicBlock *ChainBB : FunctionChain) {
2548  if (ChainBB == *FunctionChain.begin())
2549  continue;
2550 
2551  // Don't align non-looping basic blocks. These are unlikely to execute
2552  // enough times to matter in practice. Note that we'll still handle
2553  // unnatural CFGs inside of a natural outer loop (the common case) and
2554  // rotated loops.
2555  MachineLoop *L = MLI->getLoopFor(ChainBB);
2556  if (!L)
2557  continue;
2558 
2559  unsigned Align = TLI->getPrefLoopAlignment(L);
2560  if (!Align)
2561  continue; // Don't care about loop alignment.
2562 
2563  // If the block is cold relative to the function entry don't waste space
2564  // aligning it.
2565  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2566  if (Freq < WeightedEntryFreq)
2567  continue;
2568 
2569  // If the block is cold relative to its loop header, don't align it
2570  // regardless of what edges into the block exist.
2571  MachineBasicBlock *LoopHeader = L->getHeader();
2572  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2573  if (Freq < (LoopHeaderFreq * ColdProb))
2574  continue;
2575 
2576  // Check for the existence of a non-layout predecessor which would benefit
2577  // from aligning this block.
2578  MachineBasicBlock *LayoutPred =
2579  &*std::prev(MachineFunction::iterator(ChainBB));
2580 
2581  // Force alignment if all the predecessors are jumps. We already checked
2582  // that the block isn't cold above.
2583  if (!LayoutPred->isSuccessor(ChainBB)) {
2584  ChainBB->setAlignment(Align);
2585  continue;
2586  }
2587 
2588  // Align this block if the layout predecessor's edge into this block is
2589  // cold relative to the block. When this is true, other predecessors make up
2590  // all of the hot entries into the block and thus alignment is likely to be
2591  // important.
2592  BranchProbability LayoutProb =
2593  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2594  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2595  if (LayoutEdgeFreq <= (Freq * ColdProb))
2596  ChainBB->setAlignment(Align);
2597  }
2598 }
2599 
2600 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2601 /// it was duplicated into its chain predecessor and removed.
2602 /// \p BB - Basic block that may be duplicated.
2603 ///
2604 /// \p LPred - Chosen layout predecessor of \p BB.
2605 /// Updated to be the chain end if LPred is removed.
2606 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2607 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2608 /// Used to identify which blocks to update predecessor
2609 /// counts.
2610 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2611 /// chosen in the given order due to unnatural CFG
2612 /// only needed if \p BB is removed and
2613 /// \p PrevUnplacedBlockIt pointed to \p BB.
2614 /// @return true if \p BB was removed.
2615 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2616  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2617  const MachineBasicBlock *LoopHeaderBB,
2618  BlockChain &Chain, BlockFilterSet *BlockFilter,
2619  MachineFunction::iterator &PrevUnplacedBlockIt) {
2620  bool Removed, DuplicatedToLPred;
2621  bool DuplicatedToOriginalLPred;
2622  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2623  PrevUnplacedBlockIt,
2624  DuplicatedToLPred);
2625  if (!Removed)
2626  return false;
2627  DuplicatedToOriginalLPred = DuplicatedToLPred;
2628  // Iteratively try to duplicate again. It can happen that a block that is
2629  // duplicated into is still small enough to be duplicated again.
2630  // No need to call markBlockSuccessors in this case, as the blocks being
2631  // duplicated from here on are already scheduled.
2632  // Note that DuplicatedToLPred always implies Removed.
2633  while (DuplicatedToLPred) {
2634  assert(Removed && "Block must have been removed to be duplicated into its "
2635  "layout predecessor.");
2636  MachineBasicBlock *DupBB, *DupPred;
2637  // The removal callback causes Chain.end() to be updated when a block is
2638  // removed. On the first pass through the loop, the chain end should be the
2639  // same as it was on function entry. On subsequent passes, because we are
2640  // duplicating the block at the end of the chain, if it is removed the
2641  // chain will have shrunk by one block.
2642  BlockChain::iterator ChainEnd = Chain.end();
2643  DupBB = *(--ChainEnd);
2644  // Now try to duplicate again.
2645  if (ChainEnd == Chain.begin())
2646  break;
2647  DupPred = *std::prev(ChainEnd);
2648  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2649  PrevUnplacedBlockIt,
2650  DuplicatedToLPred);
2651  }
2652  // If BB was duplicated into LPred, it is now scheduled. But because it was
2653  // removed, markChainSuccessors won't be called for its chain. Instead we
2654  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2655  // at the end because repeating the tail duplication can increase the number
2656  // of unscheduled predecessors.
2657  LPred = *std::prev(Chain.end());
2658  if (DuplicatedToOriginalLPred)
2659  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2660  return true;
2661 }
2662 
2663 /// Tail duplicate \p BB into (some) predecessors if profitable.
2664 /// \p BB - Basic block that may be duplicated
2665 /// \p LPred - Chosen layout predecessor of \p BB
2666 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2667 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2668 /// Used to identify which blocks to update predecessor
2669 /// counts.
2670 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2671 /// chosen in the given order due to unnatural CFG
2672 /// only needed if \p BB is removed and
2673 /// \p PrevUnplacedBlockIt pointed to \p BB.
2674 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2675 /// only be true if the block was removed.
2676 /// \return - True if the block was duplicated into all preds and removed.
2677 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2679  BlockChain &Chain, BlockFilterSet *BlockFilter,
2680  MachineFunction::iterator &PrevUnplacedBlockIt,
2681  bool &DuplicatedToLPred) {
2682  DuplicatedToLPred = false;
2683  if (!shouldTailDuplicate(BB))
2684  return false;
2685 
2686  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2687  << "\n");
2688 
2689  // This has to be a callback because none of it can be done after
2690  // BB is deleted.
2691  bool Removed = false;
2692  auto RemovalCallback =
2693  [&](MachineBasicBlock *RemBB) {
2694  // Signal to outer function
2695  Removed = true;
2696 
2697  // Conservative default.
2698  bool InWorkList = true;
2699  // Remove from the Chain and Chain Map
2700  if (BlockToChain.count(RemBB)) {
2701  BlockChain *Chain = BlockToChain[RemBB];
2702  InWorkList = Chain->UnscheduledPredecessors == 0;
2703  Chain->remove(RemBB);
2704  BlockToChain.erase(RemBB);
2705  }
2706 
2707  // Handle the unplaced block iterator
2708  if (&(*PrevUnplacedBlockIt) == RemBB) {
2709  PrevUnplacedBlockIt++;
2710  }
2711 
2712  // Handle the Work Lists
2713  if (InWorkList) {
2714  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2715  if (RemBB->isEHPad())
2716  RemoveList = EHPadWorkList;
2717  RemoveList.erase(
2718  llvm::remove_if(RemoveList,
2719  [RemBB](MachineBasicBlock *BB) {
2720  return BB == RemBB;
2721  }),
2722  RemoveList.end());
2723  }
2724 
2725  // Handle the filter set
2726  if (BlockFilter) {
2727  BlockFilter->remove(RemBB);
2728  }
2729 
2730  // Remove the block from loop info.
2731  MLI->removeBlock(RemBB);
2732  if (RemBB == PreferredLoopExit)
2733  PreferredLoopExit = nullptr;
2734 
2735  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2736  << getBlockName(RemBB) << "\n");
2737  };
2738  auto RemovalCallbackRef =
2739  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2740 
2741  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2742  bool IsSimple = TailDup.isSimpleBB(BB);
2743  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2744  &DuplicatedPreds, &RemovalCallbackRef);
2745 
2746  // Update UnscheduledPredecessors to reflect tail-duplication.
2747  DuplicatedToLPred = false;
2748  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2749  // We're only looking for unscheduled predecessors that match the filter.
2750  BlockChain* PredChain = BlockToChain[Pred];
2751  if (Pred == LPred)
2752  DuplicatedToLPred = true;
2753  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2754  || PredChain == &Chain)
2755  continue;
2756  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2757  if (BlockFilter && !BlockFilter->count(NewSucc))
2758  continue;
2759  BlockChain *NewChain = BlockToChain[NewSucc];
2760  if (NewChain != &Chain && NewChain != PredChain)
2761  NewChain->UnscheduledPredecessors++;
2762  }
2763  }
2764  return Removed;
2765 }
2766 
2767 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2768  if (skipFunction(MF.getFunction()))
2769  return false;
2770 
2771  // Check for single-block functions and skip them.
2772  if (std::next(MF.begin()) == MF.end())
2773  return false;
2774 
2775  F = &MF;
2776  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2777  MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
2778  getAnalysis<MachineBlockFrequencyInfo>());
2779  MLI = &getAnalysis<MachineLoopInfo>();
2780  TII = MF.getSubtarget().getInstrInfo();
2781  TLI = MF.getSubtarget().getTargetLowering();
2782  MPDT = nullptr;
2783 
2784  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2785  // there are no MachineLoops.
2786  PreferredLoopExit = nullptr;
2787 
2788  assert(BlockToChain.empty() &&
2789  "BlockToChain map should be empty before starting placement.");
2790  assert(ComputedEdges.empty() &&
2791  "Computed Edge map should be empty before starting placement.");
2792 
2793  unsigned TailDupSize = TailDupPlacementThreshold;
2794  // If only the aggressive threshold is explicitly set, use it.
2795  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
2796  TailDupPlacementThreshold.getNumOccurrences() == 0)
2798 
2799  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
2800  // For aggressive optimization, we can adjust some thresholds to be less
2801  // conservative.
2802  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
2803  // At O3 we should be more willing to copy blocks for tail duplication. This
2804  // increases size pressure, so we only do it at O3
2805  // Do this unless only the regular threshold is explicitly set.
2806  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
2807  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
2809  }
2810 
2811  if (allowTailDupPlacement()) {
2812  MPDT = &getAnalysis<MachinePostDominatorTree>();
2813  if (MF.getFunction().optForSize())
2814  TailDupSize = 1;
2815  bool PreRegAlloc = false;
2816  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
2817  precomputeTriangleChains();
2818  }
2819 
2820  buildCFGChains();
2821 
2822  // Changing the layout can create new tail merging opportunities.
2823  // TailMerge can create jump into if branches that make CFG irreducible for
2824  // HW that requires structured CFG.
2825  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
2826  PassConfig->getEnableTailMerge() &&
2828  // No tail merging opportunities if the block number is less than four.
2829  if (MF.size() > 3 && EnableTailMerge) {
2830  unsigned TailMergeSize = TailDupSize + 1;
2831  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
2832  *MBPI, TailMergeSize);
2833 
2834  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
2835  getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
2836  /*AfterBlockPlacement=*/true)) {
2837  // Redo the layout if tail merging creates/removes/moves blocks.
2838  BlockToChain.clear();
2839  ComputedEdges.clear();
2840  // Must redo the post-dominator tree if blocks were changed.
2841  if (MPDT)
2842  MPDT->runOnMachineFunction(MF);
2843  ChainAllocator.DestroyAll();
2844  buildCFGChains();
2845  }
2846  }
2847 
2848  optimizeBranches();
2849  alignBlocks();
2850 
2851  BlockToChain.clear();
2852  ComputedEdges.clear();
2853  ChainAllocator.DestroyAll();
2854 
2855  if (AlignAllBlock)
2856  // Align all of the blocks in the function to a specific alignment.
2857  for (MachineBasicBlock &MBB : MF)
2859  else if (AlignAllNonFallThruBlocks) {
2860  // Align all of the blocks that have no fall-through predecessors to a
2861  // specific alignment.
2862  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
2863  auto LayoutPred = std::prev(MBI);
2864  if (!LayoutPred->isSuccessor(&*MBI))
2865  MBI->setAlignment(AlignAllNonFallThruBlocks);
2866  }
2867  }
2868  if (ViewBlockLayoutWithBFI != GVDT_None &&
2869  (ViewBlockFreqFuncName.empty() ||
2870  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
2871  MBFI->view("MBP." + MF.getName(), false);
2872  }
2873 
2874 
2875  // We always return true as we have no way to track whether the final order
2876  // differs from the original order.
2877  return true;
2878 }
2879 
2880 namespace {
2881 
2882 /// A pass to compute block placement statistics.
2883 ///
2884 /// A separate pass to compute interesting statistics for evaluating block
2885 /// placement. This is separate from the actual placement pass so that they can
2886 /// be computed in the absence of any placement transformations or when using
2887 /// alternative placement strategies.
2888 class MachineBlockPlacementStats : public MachineFunctionPass {
2889  /// A handle to the branch probability pass.
2890  const MachineBranchProbabilityInfo *MBPI;
2891 
2892  /// A handle to the function-wide block frequency pass.
2893  const MachineBlockFrequencyInfo *MBFI;
2894 
2895 public:
2896  static char ID; // Pass identification, replacement for typeid
2897 
2898  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
2900  }
2901 
2902  bool runOnMachineFunction(MachineFunction &F) override;
2903 
2904  void getAnalysisUsage(AnalysisUsage &AU) const override {
2907  AU.setPreservesAll();
2909  }
2910 };
2911 
2912 } // end anonymous namespace
2913 
2915 
2917 
2918 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
2919  "Basic Block Placement Stats", false, false)
2922 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
2923  "Basic Block Placement Stats", false, false)
2924 
2925 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
2926  // Check for single-block functions and skip them.
2927  if (std::next(F.begin()) == F.end())
2928  return false;
2929 
2930  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2931  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
2932 
2933  for (MachineBasicBlock &MBB : F) {
2934  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
2935  Statistic &NumBranches =
2936  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
2937  Statistic &BranchTakenFreq =
2938  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
2939  for (MachineBasicBlock *Succ : MBB.successors()) {
2940  // Skip if this successor is a fallthrough.
2941  if (MBB.isLayoutSuccessor(Succ))
2942  continue;
2943 
2944  BlockFrequency EdgeFreq =
2945  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
2946  ++NumBranches;
2947  BranchTakenFreq += EdgeFreq.getFrequency();
2948  }
2949  }
2950 
2951  return false;
2952 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:258
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
BranchProbability getCompl() const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:249
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:320
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:464
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:116
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void setAlignment(unsigned Align)
Set alignment of the basic block.
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:221
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:479
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:99
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:266
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:462
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1251
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:422
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:290
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
succ_reverse_iterator succ_rbegin()
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
virtual unsigned getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:597
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:490
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1225
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:437
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:52
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1206
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:92
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:109
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:839
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:441
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:319
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:160
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:162
unsigned succ_size(const Instruction *I)
Definition: CFG.h:256
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " "blocks in the function."), cl::init(0), cl::Hidden)
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:644
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:148
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
bool optForMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:594
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:154
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:171
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:211
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:482
#define DEBUG_TYPE
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:153
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all " "blocks that have no fall-through predecessors (i.e. don't add " "nops that are executed)."), cl::init(0), cl::Hidden)
bool hasProfileData() const
Return true if the function is annotated with profile data.
Definition: Function.h:307
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.