LLVM  10.0.0svn
MachineBlockPlacement.cpp
Go to the documentation of this file.
1 //===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements basic block placement transformations using the CFG
10 // structure and branch probability estimates.
11 //
12 // The pass strives to preserve the structure of the CFG (that is, retain
13 // a topological ordering of basic blocks) in the absence of a *strong* signal
14 // to the contrary from probabilities. However, within the CFG structure, it
15 // attempts to choose an ordering which favors placing more likely sequences of
16 // blocks adjacent to each other.
17 //
18 // The algorithm works from the inner-most loop within a function outward, and
19 // at each stage walks through the basic blocks, trying to coalesce them into
20 // sequential chains where allowed by the CFG (or demanded by heavy
21 // probabilities). Finally, it walks the blocks in topological order, and the
22 // first time it reaches a chain of basic blocks, it schedules them in the
23 // function in-order.
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #include "BranchFolding.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/DenseMap.h"
30 #include "llvm/ADT/STLExtras.h"
31 #include "llvm/ADT/SetVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
49 #include "llvm/IR/DebugLoc.h"
50 #include "llvm/IR/Function.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Allocator.h"
55 #include "llvm/Support/CodeGen.h"
57 #include "llvm/Support/Compiler.h"
58 #include "llvm/Support/Debug.h"
61 #include <algorithm>
62 #include <cassert>
63 #include <cstdint>
64 #include <iterator>
65 #include <memory>
66 #include <string>
67 #include <tuple>
68 #include <utility>
69 #include <vector>
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "block-placement"
74 
75 STATISTIC(NumCondBranches, "Number of conditional branches");
76 STATISTIC(NumUncondBranches, "Number of unconditional branches");
77 STATISTIC(CondBranchTakenFreq,
78  "Potential frequency of taking conditional branches");
79 STATISTIC(UncondBranchTakenFreq,
80  "Potential frequency of taking unconditional branches");
81 
83  "align-all-blocks",
84  cl::desc("Force the alignment of all blocks in the function in log2 format "
85  "(e.g 4 means align on 16B boundaries)."),
86  cl::init(0), cl::Hidden);
87 
89  "align-all-nofallthru-blocks",
90  cl::desc("Force the alignment of all blocks that have no fall-through "
91  "predecessors (i.e. don't add nops that are executed). In log2 "
92  "format (e.g 4 means align on 16B boundaries)."),
93  cl::init(0), cl::Hidden);
94 
95 // FIXME: Find a good default for this flag and remove the flag.
97  "block-placement-exit-block-bias",
98  cl::desc("Block frequency percentage a loop exit block needs "
99  "over the original exit to be considered the new exit."),
100  cl::init(0), cl::Hidden);
101 
102 // Definition:
103 // - Outlining: placement of a basic block outside the chain or hot path.
104 
106  "loop-to-cold-block-ratio",
107  cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
108  "(frequency of block) is greater than this ratio"),
109  cl::init(5), cl::Hidden);
110 
112  "force-loop-cold-block",
113  cl::desc("Force outlining cold blocks from loops."),
114  cl::init(false), cl::Hidden);
115 
116 static cl::opt<bool>
117  PreciseRotationCost("precise-rotation-cost",
118  cl::desc("Model the cost of loop rotation more "
119  "precisely by using profile data."),
120  cl::init(false), cl::Hidden);
121 
122 static cl::opt<bool>
123  ForcePreciseRotationCost("force-precise-rotation-cost",
124  cl::desc("Force the use of precise cost "
125  "loop rotation strategy."),
126  cl::init(false), cl::Hidden);
127 
129  "misfetch-cost",
130  cl::desc("Cost that models the probabilistic risk of an instruction "
131  "misfetch due to a jump comparing to falling through, whose cost "
132  "is zero."),
133  cl::init(1), cl::Hidden);
134 
135 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
136  cl::desc("Cost of jump instructions."),
137  cl::init(1), cl::Hidden);
138 static cl::opt<bool>
139 TailDupPlacement("tail-dup-placement",
140  cl::desc("Perform tail duplication during placement. "
141  "Creates more fallthrough opportunites in "
142  "outline branches."),
143  cl::init(true), cl::Hidden);
144 
145 static cl::opt<bool>
146 BranchFoldPlacement("branch-fold-placement",
147  cl::desc("Perform branch folding during placement. "
148  "Reduces code size."),
149  cl::init(true), cl::Hidden);
150 
151 // Heuristic for tail duplication.
153  "tail-dup-placement-threshold",
154  cl::desc("Instruction cutoff for tail duplication during layout. "
155  "Tail merging during layout is forced to have a threshold "
156  "that won't conflict."), cl::init(2),
157  cl::Hidden);
158 
159 // Heuristic for aggressive tail duplication.
161  "tail-dup-placement-aggressive-threshold",
162  cl::desc("Instruction cutoff for aggressive tail duplication during "
163  "layout. Used at -O3. Tail merging during layout is forced to "
164  "have a threshold that won't conflict."), cl::init(4),
165  cl::Hidden);
166 
167 // Heuristic for tail duplication.
169  "tail-dup-placement-penalty",
170  cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
171  "Copying can increase fallthrough, but it also increases icache "
172  "pressure. This parameter controls the penalty to account for that. "
173  "Percent as integer."),
174  cl::init(2),
175  cl::Hidden);
176 
177 // Heuristic for triangle chains.
179  "triangle-chain-count",
180  cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
181  "triangle tail duplication heuristic to kick in. 0 to disable."),
182  cl::init(2),
183  cl::Hidden);
184 
187 
188 // Internal option used to control BFI display only after MBP pass.
189 // Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
190 // -view-block-layout-with-bfi=
192 
193 // Command line option to specify the name of the function for CFG dump
194 // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
196 
197 namespace {
198 
199 class BlockChain;
200 
201 /// Type for our function-wide basic block -> block chain mapping.
202 using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
203 
204 /// A chain of blocks which will be laid out contiguously.
205 ///
206 /// This is the datastructure representing a chain of consecutive blocks that
207 /// are profitable to layout together in order to maximize fallthrough
208 /// probabilities and code locality. We also can use a block chain to represent
209 /// a sequence of basic blocks which have some external (correctness)
210 /// requirement for sequential layout.
211 ///
212 /// Chains can be built around a single basic block and can be merged to grow
213 /// them. They participate in a block-to-chain mapping, which is updated
214 /// automatically as chains are merged together.
215 class BlockChain {
216  /// The sequence of blocks belonging to this chain.
217  ///
218  /// This is the sequence of blocks for a particular chain. These will be laid
219  /// out in-order within the function.
221 
222  /// A handle to the function-wide basic block to block chain mapping.
223  ///
224  /// This is retained in each block chain to simplify the computation of child
225  /// block chains for SCC-formation and iteration. We store the edges to child
226  /// basic blocks, and map them back to their associated chains using this
227  /// structure.
228  BlockToChainMapType &BlockToChain;
229 
230 public:
231  /// Construct a new BlockChain.
232  ///
233  /// This builds a new block chain representing a single basic block in the
234  /// function. It also registers itself as the chain that block participates
235  /// in with the BlockToChain mapping.
236  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
237  : Blocks(1, BB), BlockToChain(BlockToChain) {
238  assert(BB && "Cannot create a chain with a null basic block");
239  BlockToChain[BB] = this;
240  }
241 
242  /// Iterator over blocks within the chain.
245 
246  /// Beginning of blocks within the chain.
247  iterator begin() { return Blocks.begin(); }
248  const_iterator begin() const { return Blocks.begin(); }
249 
250  /// End of blocks within the chain.
251  iterator end() { return Blocks.end(); }
252  const_iterator end() const { return Blocks.end(); }
253 
254  bool remove(MachineBasicBlock* BB) {
255  for(iterator i = begin(); i != end(); ++i) {
256  if (*i == BB) {
257  Blocks.erase(i);
258  return true;
259  }
260  }
261  return false;
262  }
263 
264  /// Merge a block chain into this one.
265  ///
266  /// This routine merges a block chain into this one. It takes care of forming
267  /// a contiguous sequence of basic blocks, updating the edge list, and
268  /// updating the block -> chain mapping. It does not free or tear down the
269  /// old chain, but the old chain's block list is no longer valid.
270  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
271  assert(BB && "Can't merge a null block.");
272  assert(!Blocks.empty() && "Can't merge into an empty chain.");
273 
274  // Fast path in case we don't have a chain already.
275  if (!Chain) {
276  assert(!BlockToChain[BB] &&
277  "Passed chain is null, but BB has entry in BlockToChain.");
278  Blocks.push_back(BB);
279  BlockToChain[BB] = this;
280  return;
281  }
282 
283  assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
284  assert(Chain->begin() != Chain->end());
285 
286  // Update the incoming blocks to point to this chain, and add them to the
287  // chain structure.
288  for (MachineBasicBlock *ChainBB : *Chain) {
289  Blocks.push_back(ChainBB);
290  assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
291  BlockToChain[ChainBB] = this;
292  }
293  }
294 
295 #ifndef NDEBUG
296  /// Dump the blocks in this chain.
297  LLVM_DUMP_METHOD void dump() {
298  for (MachineBasicBlock *MBB : *this)
299  MBB->dump();
300  }
301 #endif // NDEBUG
302 
303  /// Count of predecessors of any block within the chain which have not
304  /// yet been scheduled. In general, we will delay scheduling this chain
305  /// until those predecessors are scheduled (or we find a sufficiently good
306  /// reason to override this heuristic.) Note that when forming loop chains,
307  /// blocks outside the loop are ignored and treated as if they were already
308  /// scheduled.
309  ///
310  /// Note: This field is reinitialized multiple times - once for each loop,
311  /// and then once for the function as a whole.
312  unsigned UnscheduledPredecessors = 0;
313 };
314 
315 class MachineBlockPlacement : public MachineFunctionPass {
316  /// A type for a block filter set.
317  using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
318 
319  /// Pair struct containing basic block and taildup profitability
320  struct BlockAndTailDupResult {
321  MachineBasicBlock *BB;
322  bool ShouldTailDup;
323  };
324 
325  /// Triple struct containing edge weight and the edge.
326  struct WeightedEdge {
327  BlockFrequency Weight;
328  MachineBasicBlock *Src;
329  MachineBasicBlock *Dest;
330  };
331 
332  /// work lists of blocks that are ready to be laid out
335 
336  /// Edges that have already been computed as optimal.
338 
339  /// Machine Function
341 
342  /// A handle to the branch probability pass.
343  const MachineBranchProbabilityInfo *MBPI;
344 
345  /// A handle to the function-wide block frequency pass.
346  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
347 
348  /// A handle to the loop info.
349  MachineLoopInfo *MLI;
350 
351  /// Preferred loop exit.
352  /// Member variable for convenience. It may be removed by duplication deep
353  /// in the call stack.
354  MachineBasicBlock *PreferredLoopExit;
355 
356  /// A handle to the target's instruction info.
357  const TargetInstrInfo *TII;
358 
359  /// A handle to the target's lowering info.
360  const TargetLoweringBase *TLI;
361 
362  /// A handle to the post dominator tree.
364 
365  /// Duplicator used to duplicate tails during placement.
366  ///
367  /// Placement decisions can open up new tail duplication opportunities, but
368  /// since tail duplication affects placement decisions of later blocks, it
369  /// must be done inline.
370  TailDuplicator TailDup;
371 
372  /// Allocator and owner of BlockChain structures.
373  ///
374  /// We build BlockChains lazily while processing the loop structure of
375  /// a function. To reduce malloc traffic, we allocate them using this
376  /// slab-like allocator, and destroy them after the pass completes. An
377  /// important guarantee is that this allocator produces stable pointers to
378  /// the chains.
380 
381  /// Function wide BasicBlock to BlockChain mapping.
382  ///
383  /// This mapping allows efficiently moving from any given basic block to the
384  /// BlockChain it participates in, if any. We use it to, among other things,
385  /// allow implicitly defining edges between chains as the existing edges
386  /// between basic blocks.
388 
389 #ifndef NDEBUG
390  /// The set of basic blocks that have terminators that cannot be fully
391  /// analyzed. These basic blocks cannot be re-ordered safely by
392  /// MachineBlockPlacement, and we must preserve physical layout of these
393  /// blocks and their successors through the pass.
394  SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
395 #endif
396 
397  /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
398  /// if the count goes to 0, add them to the appropriate work list.
399  void markChainSuccessors(
400  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
401  const BlockFilterSet *BlockFilter = nullptr);
402 
403  /// Decrease the UnscheduledPredecessors count for a single block, and
404  /// if the count goes to 0, add them to the appropriate work list.
405  void markBlockSuccessors(
406  const BlockChain &Chain, const MachineBasicBlock *BB,
407  const MachineBasicBlock *LoopHeaderBB,
408  const BlockFilterSet *BlockFilter = nullptr);
409 
411  collectViableSuccessors(
412  const MachineBasicBlock *BB, const BlockChain &Chain,
413  const BlockFilterSet *BlockFilter,
415  bool shouldPredBlockBeOutlined(
416  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
417  const BlockChain &Chain, const BlockFilterSet *BlockFilter,
418  BranchProbability SuccProb, BranchProbability HotProb);
419  bool repeatedlyTailDuplicateBlock(
421  const MachineBasicBlock *LoopHeaderBB,
422  BlockChain &Chain, BlockFilterSet *BlockFilter,
423  MachineFunction::iterator &PrevUnplacedBlockIt);
424  bool maybeTailDuplicateBlock(
426  BlockChain &Chain, BlockFilterSet *BlockFilter,
427  MachineFunction::iterator &PrevUnplacedBlockIt,
428  bool &DuplicatedToLPred);
429  bool hasBetterLayoutPredecessor(
430  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
431  const BlockChain &SuccChain, BranchProbability SuccProb,
432  BranchProbability RealSuccProb, const BlockChain &Chain,
433  const BlockFilterSet *BlockFilter);
434  BlockAndTailDupResult selectBestSuccessor(
435  const MachineBasicBlock *BB, const BlockChain &Chain,
436  const BlockFilterSet *BlockFilter);
437  MachineBasicBlock *selectBestCandidateBlock(
438  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
439  MachineBasicBlock *getFirstUnplacedBlock(
440  const BlockChain &PlacedChain,
441  MachineFunction::iterator &PrevUnplacedBlockIt,
442  const BlockFilterSet *BlockFilter);
443 
444  /// Add a basic block to the work list if it is appropriate.
445  ///
446  /// If the optional parameter BlockFilter is provided, only MBB
447  /// present in the set will be added to the worklist. If nullptr
448  /// is provided, no filtering occurs.
449  void fillWorkLists(const MachineBasicBlock *MBB,
450  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
451  const BlockFilterSet *BlockFilter);
452 
453  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
454  BlockFilterSet *BlockFilter = nullptr);
455  bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
456  const MachineBasicBlock *OldTop);
457  bool hasViableTopFallthrough(const MachineBasicBlock *Top,
458  const BlockFilterSet &LoopBlockSet);
459  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
460  const BlockFilterSet &LoopBlockSet);
461  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
462  const MachineBasicBlock *OldTop,
463  const MachineBasicBlock *ExitBB,
464  const BlockFilterSet &LoopBlockSet);
465  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
466  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
467  MachineBasicBlock *findBestLoopTop(
468  const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
469  MachineBasicBlock *findBestLoopExit(
470  const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
471  BlockFrequency &ExitFreq);
472  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
473  void buildLoopChains(const MachineLoop &L);
474  void rotateLoop(
475  BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
476  BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
477  void rotateLoopWithProfile(
478  BlockChain &LoopChain, const MachineLoop &L,
479  const BlockFilterSet &LoopBlockSet);
480  void buildCFGChains();
481  void optimizeBranches();
482  void alignBlocks();
483  /// Returns true if a block should be tail-duplicated to increase fallthrough
484  /// opportunities.
485  bool shouldTailDuplicate(MachineBasicBlock *BB);
486  /// Check the edge frequencies to see if tail duplication will increase
487  /// fallthroughs.
488  bool isProfitableToTailDup(
489  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
490  BranchProbability QProb,
491  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
492 
493  /// Check for a trellis layout.
494  bool isTrellis(const MachineBasicBlock *BB,
495  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
496  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
497 
498  /// Get the best successor given a trellis layout.
499  BlockAndTailDupResult getBestTrellisSuccessor(
500  const MachineBasicBlock *BB,
501  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
502  BranchProbability AdjustedSumProb, const BlockChain &Chain,
503  const BlockFilterSet *BlockFilter);
504 
505  /// Get the best pair of non-conflicting edges.
506  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
507  const MachineBasicBlock *BB,
509 
510  /// Returns true if a block can tail duplicate into all unplaced
511  /// predecessors. Filters based on loop.
512  bool canTailDuplicateUnplacedPreds(
513  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
514  const BlockChain &Chain, const BlockFilterSet *BlockFilter);
515 
516  /// Find chains of triangles to tail-duplicate where a global analysis works,
517  /// but a local analysis would not find them.
518  void precomputeTriangleChains();
519 
520 public:
521  static char ID; // Pass identification, replacement for typeid
522 
523  MachineBlockPlacement() : MachineFunctionPass(ID) {
525  }
526 
527  bool runOnMachineFunction(MachineFunction &F) override;
528 
529  bool allowTailDupPlacement() const {
530  assert(F);
532  }
533 
534  void getAnalysisUsage(AnalysisUsage &AU) const override {
537  if (TailDupPlacement)
542  }
543 };
544 
545 } // end anonymous namespace
546 
548 
550 
551 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
552  "Branch Probability Basic Block Placement", false, false)
557 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
558  "Branch Probability Basic Block Placement", false, false)
559 
560 #ifndef NDEBUG
561 /// Helper to print the name of a MBB.
562 ///
563 /// Only used by debug logging.
564 static std::string getBlockName(const MachineBasicBlock *BB) {
565  std::string Result;
566  raw_string_ostream OS(Result);
567  OS << printMBBReference(*BB);
568  OS << " ('" << BB->getName() << "')";
569  OS.flush();
570  return Result;
571 }
572 #endif
573 
574 /// Mark a chain's successors as having one fewer preds.
575 ///
576 /// When a chain is being merged into the "placed" chain, this routine will
577 /// quickly walk the successors of each block in the chain and mark them as
578 /// having one fewer active predecessor. It also adds any successors of this
579 /// chain which reach the zero-predecessor state to the appropriate worklist.
580 void MachineBlockPlacement::markChainSuccessors(
581  const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
582  const BlockFilterSet *BlockFilter) {
583  // Walk all the blocks in this chain, marking their successors as having
584  // a predecessor placed.
585  for (MachineBasicBlock *MBB : Chain) {
586  markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
587  }
588 }
589 
590 /// Mark a single block's successors as having one fewer preds.
591 ///
592 /// Under normal circumstances, this is only called by markChainSuccessors,
593 /// but if a block that was to be placed is completely tail-duplicated away,
594 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
595 /// for just that block.
596 void MachineBlockPlacement::markBlockSuccessors(
597  const BlockChain &Chain, const MachineBasicBlock *MBB,
598  const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
599  // Add any successors for which this is the only un-placed in-loop
600  // predecessor to the worklist as a viable candidate for CFG-neutral
601  // placement. No subsequent placement of this block will violate the CFG
602  // shape, so we get to use heuristics to choose a favorable placement.
603  for (MachineBasicBlock *Succ : MBB->successors()) {
604  if (BlockFilter && !BlockFilter->count(Succ))
605  continue;
606  BlockChain &SuccChain = *BlockToChain[Succ];
607  // Disregard edges within a fixed chain, or edges to the loop header.
608  if (&Chain == &SuccChain || Succ == LoopHeaderBB)
609  continue;
610 
611  // This is a cross-chain edge that is within the loop, so decrement the
612  // loop predecessor count of the destination chain.
613  if (SuccChain.UnscheduledPredecessors == 0 ||
614  --SuccChain.UnscheduledPredecessors > 0)
615  continue;
616 
617  auto *NewBB = *SuccChain.begin();
618  if (NewBB->isEHPad())
619  EHPadWorkList.push_back(NewBB);
620  else
621  BlockWorkList.push_back(NewBB);
622  }
623 }
624 
625 /// This helper function collects the set of successors of block
626 /// \p BB that are allowed to be its layout successors, and return
627 /// the total branch probability of edges from \p BB to those
628 /// blocks.
629 BranchProbability MachineBlockPlacement::collectViableSuccessors(
630  const MachineBasicBlock *BB, const BlockChain &Chain,
631  const BlockFilterSet *BlockFilter,
633  // Adjust edge probabilities by excluding edges pointing to blocks that is
634  // either not in BlockFilter or is already in the current chain. Consider the
635  // following CFG:
636  //
637  // --->A
638  // | / \
639  // | B C
640  // | \ / \
641  // ----D E
642  //
643  // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
644  // A->C is chosen as a fall-through, D won't be selected as a successor of C
645  // due to CFG constraint (the probability of C->D is not greater than
646  // HotProb to break topo-order). If we exclude E that is not in BlockFilter
647  // when calculating the probability of C->D, D will be selected and we
648  // will get A C D B as the layout of this loop.
649  auto AdjustedSumProb = BranchProbability::getOne();
650  for (MachineBasicBlock *Succ : BB->successors()) {
651  bool SkipSucc = false;
652  if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
653  SkipSucc = true;
654  } else {
655  BlockChain *SuccChain = BlockToChain[Succ];
656  if (SuccChain == &Chain) {
657  SkipSucc = true;
658  } else if (Succ != *SuccChain->begin()) {
659  LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
660  << " -> Mid chain!\n");
661  continue;
662  }
663  }
664  if (SkipSucc)
665  AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
666  else
667  Successors.push_back(Succ);
668  }
669 
670  return AdjustedSumProb;
671 }
672 
673 /// The helper function returns the branch probability that is adjusted
674 /// or normalized over the new total \p AdjustedSumProb.
675 static BranchProbability
677  BranchProbability AdjustedSumProb) {
678  BranchProbability SuccProb;
679  uint32_t SuccProbN = OrigProb.getNumerator();
680  uint32_t SuccProbD = AdjustedSumProb.getNumerator();
681  if (SuccProbN >= SuccProbD)
682  SuccProb = BranchProbability::getOne();
683  else
684  SuccProb = BranchProbability(SuccProbN, SuccProbD);
685 
686  return SuccProb;
687 }
688 
689 /// Check if \p BB has exactly the successors in \p Successors.
690 static bool
693  if (BB.succ_size() != Successors.size())
694  return false;
695  // We don't want to count self-loops
696  if (Successors.count(&BB))
697  return false;
698  for (MachineBasicBlock *Succ : BB.successors())
699  if (!Successors.count(Succ))
700  return false;
701  return true;
702 }
703 
704 /// Check if a block should be tail duplicated to increase fallthrough
705 /// opportunities.
706 /// \p BB Block to check.
707 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
708  // Blocks with single successors don't create additional fallthrough
709  // opportunities. Don't duplicate them. TODO: When conditional exits are
710  // analyzable, allow them to be duplicated.
711  bool IsSimple = TailDup.isSimpleBB(BB);
712 
713  if (BB->succ_size() == 1)
714  return false;
715  return TailDup.shouldTailDuplicate(IsSimple, *BB);
716 }
717 
718 /// Compare 2 BlockFrequency's with a small penalty for \p A.
719 /// In order to be conservative, we apply a X% penalty to account for
720 /// increased icache pressure and static heuristics. For small frequencies
721 /// we use only the numerators to improve accuracy. For simplicity, we assume the
722 /// penalty is less than 100%
723 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
725  uint64_t EntryFreq) {
726  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
727  BlockFrequency Gain = A - B;
728  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
729 }
730 
731 /// Check the edge frequencies to see if tail duplication will increase
732 /// fallthroughs. It only makes sense to call this function when
733 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
734 /// always locally profitable if we would have picked \p Succ without
735 /// considering duplication.
736 bool MachineBlockPlacement::isProfitableToTailDup(
737  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
738  BranchProbability QProb,
739  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
740  // We need to do a probability calculation to make sure this is profitable.
741  // First: does succ have a successor that post-dominates? This affects the
742  // calculation. The 2 relevant cases are:
743  // BB BB
744  // | \Qout | \Qout
745  // P| C |P C
746  // = C' = C'
747  // | /Qin | /Qin
748  // | / | /
749  // Succ Succ
750  // / \ | \ V
751  // U/ =V |U \
752  // / \ = D
753  // D E | /
754  // | /
755  // |/
756  // PDom
757  // '=' : Branch taken for that CFG edge
758  // In the second case, Placing Succ while duplicating it into C prevents the
759  // fallthrough of Succ into either D or PDom, because they now have C as an
760  // unplaced predecessor
761 
762  // Start by figuring out which case we fall into
763  MachineBasicBlock *PDom = nullptr;
765  // Only scan the relevant successors
766  auto AdjustedSuccSumProb =
767  collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
768  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
769  auto BBFreq = MBFI->getBlockFreq(BB);
770  auto SuccFreq = MBFI->getBlockFreq(Succ);
771  BlockFrequency P = BBFreq * PProb;
772  BlockFrequency Qout = BBFreq * QProb;
773  uint64_t EntryFreq = MBFI->getEntryFreq();
774  // If there are no more successors, it is profitable to copy, as it strictly
775  // increases fallthrough.
776  if (SuccSuccs.size() == 0)
777  return greaterWithBias(P, Qout, EntryFreq);
778 
779  auto BestSuccSucc = BranchProbability::getZero();
780  // Find the PDom or the best Succ if no PDom exists.
781  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
782  auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
783  if (Prob > BestSuccSucc)
784  BestSuccSucc = Prob;
785  if (PDom == nullptr)
786  if (MPDT->dominates(SuccSucc, Succ)) {
787  PDom = SuccSucc;
788  break;
789  }
790  }
791  // For the comparisons, we need to know Succ's best incoming edge that isn't
792  // from BB.
793  auto SuccBestPred = BlockFrequency(0);
794  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
795  if (SuccPred == Succ || SuccPred == BB
796  || BlockToChain[SuccPred] == &Chain
797  || (BlockFilter && !BlockFilter->count(SuccPred)))
798  continue;
799  auto Freq = MBFI->getBlockFreq(SuccPred)
800  * MBPI->getEdgeProbability(SuccPred, Succ);
801  if (Freq > SuccBestPred)
802  SuccBestPred = Freq;
803  }
804  // Qin is Succ's best unplaced incoming edge that isn't BB
805  BlockFrequency Qin = SuccBestPred;
806  // If it doesn't have a post-dominating successor, here is the calculation:
807  // BB BB
808  // | \Qout | \
809  // P| C | =
810  // = C' | C
811  // | /Qin | |
812  // | / | C' (+Succ)
813  // Succ Succ /|
814  // / \ | \/ |
815  // U/ =V | == |
816  // / \ | / \|
817  // D E D E
818  // '=' : Branch taken for that CFG edge
819  // Cost in the first case is: P + V
820  // For this calculation, we always assume P > Qout. If Qout > P
821  // The result of this function will be ignored at the caller.
822  // Let F = SuccFreq - Qin
823  // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
824 
825  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
826  BranchProbability UProb = BestSuccSucc;
827  BranchProbability VProb = AdjustedSuccSumProb - UProb;
828  BlockFrequency F = SuccFreq - Qin;
829  BlockFrequency V = SuccFreq * VProb;
830  BlockFrequency QinU = std::min(Qin, F) * UProb;
831  BlockFrequency BaseCost = P + V;
832  BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
833  return greaterWithBias(BaseCost, DupCost, EntryFreq);
834  }
835  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
836  BranchProbability VProb = AdjustedSuccSumProb - UProb;
837  BlockFrequency U = SuccFreq * UProb;
838  BlockFrequency V = SuccFreq * VProb;
839  BlockFrequency F = SuccFreq - Qin;
840  // If there is a post-dominating successor, here is the calculation:
841  // BB BB BB BB
842  // | \Qout | \ | \Qout | \
843  // |P C | = |P C | =
844  // = C' |P C = C' |P C
845  // | /Qin | | | /Qin | |
846  // | / | C' (+Succ) | / | C' (+Succ)
847  // Succ Succ /| Succ Succ /|
848  // | \ V | \/ | | \ V | \/ |
849  // |U \ |U /\ =? |U = |U /\ |
850  // = D = = =?| | D | = =|
851  // | / |/ D | / |/ D
852  // | / | / | = | /
853  // |/ | / |/ | =
854  // Dom Dom Dom Dom
855  // '=' : Branch taken for that CFG edge
856  // The cost for taken branches in the first case is P + U
857  // Let F = SuccFreq - Qin
858  // The cost in the second case (assuming independence), given the layout:
859  // BB, Succ, (C+Succ), D, Dom or the layout:
860  // BB, Succ, D, Dom, (C+Succ)
861  // is Qout + max(F, Qin) * U + min(F, Qin)
862  // compare P + U vs Qout + P * U + Qin.
863  //
864  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
865  //
866  // For the 3rd case, the cost is P + 2 * V
867  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
868  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
869  if (UProb > AdjustedSuccSumProb / 2 &&
870  !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
871  Chain, BlockFilter))
872  // Cases 3 & 4
873  return greaterWithBias(
874  (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
875  EntryFreq);
876  // Cases 1 & 2
877  return greaterWithBias((P + U),
878  (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
879  std::max(Qin, F) * UProb),
880  EntryFreq);
881 }
882 
883 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
884 /// successors form the lower part of a trellis. A successor set S forms the
885 /// lower part of a trellis if all of the predecessors of S are either in S or
886 /// have all of S as successors. We ignore trellises where BB doesn't have 2
887 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
888 /// are very uncommon and complex to compute optimally. Allowing edges within S
889 /// is not strictly a trellis, but the same algorithm works, so we allow it.
890 bool MachineBlockPlacement::isTrellis(
891  const MachineBasicBlock *BB,
892  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
893  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
894  // Technically BB could form a trellis with branching factor higher than 2.
895  // But that's extremely uncommon.
896  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
897  return false;
898 
900  BB->succ_end());
901  // To avoid reviewing the same predecessors twice.
903 
904  for (MachineBasicBlock *Succ : ViableSuccs) {
905  int PredCount = 0;
906  for (auto SuccPred : Succ->predecessors()) {
907  // Allow triangle successors, but don't count them.
908  if (Successors.count(SuccPred)) {
909  // Make sure that it is actually a triangle.
910  for (MachineBasicBlock *CheckSucc : SuccPred->successors())
911  if (!Successors.count(CheckSucc))
912  return false;
913  continue;
914  }
915  const BlockChain *PredChain = BlockToChain[SuccPred];
916  if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
917  PredChain == &Chain || PredChain == BlockToChain[Succ])
918  continue;
919  ++PredCount;
920  // Perform the successor check only once.
921  if (!SeenPreds.insert(SuccPred).second)
922  continue;
923  if (!hasSameSuccessors(*SuccPred, Successors))
924  return false;
925  }
926  // If one of the successors has only BB as a predecessor, it is not a
927  // trellis.
928  if (PredCount < 1)
929  return false;
930  }
931  return true;
932 }
933 
934 /// Pick the highest total weight pair of edges that can both be laid out.
935 /// The edges in \p Edges[0] are assumed to have a different destination than
936 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
937 /// the individual highest weight edges to the 2 different destinations, or in
938 /// case of a conflict, one of them should be replaced with a 2nd best edge.
939 std::pair<MachineBlockPlacement::WeightedEdge,
940  MachineBlockPlacement::WeightedEdge>
941 MachineBlockPlacement::getBestNonConflictingEdges(
942  const MachineBasicBlock *BB,
944  Edges) {
945  // Sort the edges, and then for each successor, find the best incoming
946  // predecessor. If the best incoming predecessors aren't the same,
947  // then that is clearly the best layout. If there is a conflict, one of the
948  // successors will have to fallthrough from the second best predecessor. We
949  // compare which combination is better overall.
950 
951  // Sort for highest frequency.
952  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
953 
954  llvm::stable_sort(Edges[0], Cmp);
955  llvm::stable_sort(Edges[1], Cmp);
956  auto BestA = Edges[0].begin();
957  auto BestB = Edges[1].begin();
958  // Arrange for the correct answer to be in BestA and BestB
959  // If the 2 best edges don't conflict, the answer is already there.
960  if (BestA->Src == BestB->Src) {
961  // Compare the total fallthrough of (Best + Second Best) for both pairs
962  auto SecondBestA = std::next(BestA);
963  auto SecondBestB = std::next(BestB);
964  BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
965  BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
966  if (BestAScore < BestBScore)
967  BestA = SecondBestA;
968  else
969  BestB = SecondBestB;
970  }
971  // Arrange for the BB edge to be in BestA if it exists.
972  if (BestB->Src == BB)
973  std::swap(BestA, BestB);
974  return std::make_pair(*BestA, *BestB);
975 }
976 
977 /// Get the best successor from \p BB based on \p BB being part of a trellis.
978 /// We only handle trellises with 2 successors, so the algorithm is
979 /// straightforward: Find the best pair of edges that don't conflict. We find
980 /// the best incoming edge for each successor in the trellis. If those conflict,
981 /// we consider which of them should be replaced with the second best.
982 /// Upon return the two best edges will be in \p BestEdges. If one of the edges
983 /// comes from \p BB, it will be in \p BestEdges[0]
984 MachineBlockPlacement::BlockAndTailDupResult
985 MachineBlockPlacement::getBestTrellisSuccessor(
986  const MachineBasicBlock *BB,
987  const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
988  BranchProbability AdjustedSumProb, const BlockChain &Chain,
989  const BlockFilterSet *BlockFilter) {
990 
991  BlockAndTailDupResult Result = {nullptr, false};
993  BB->succ_end());
994 
995  // We assume size 2 because it's common. For general n, we would have to do
996  // the Hungarian algorithm, but it's not worth the complexity because more
997  // than 2 successors is fairly uncommon, and a trellis even more so.
998  if (Successors.size() != 2 || ViableSuccs.size() != 2)
999  return Result;
1000 
1001  // Collect the edge frequencies of all edges that form the trellis.
1003  int SuccIndex = 0;
1004  for (auto Succ : ViableSuccs) {
1005  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1006  // Skip any placed predecessors that are not BB
1007  if (SuccPred != BB)
1008  if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1009  BlockToChain[SuccPred] == &Chain ||
1010  BlockToChain[SuccPred] == BlockToChain[Succ])
1011  continue;
1012  BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1013  MBPI->getEdgeProbability(SuccPred, Succ);
1014  Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1015  }
1016  ++SuccIndex;
1017  }
1018 
1019  // Pick the best combination of 2 edges from all the edges in the trellis.
1020  WeightedEdge BestA, BestB;
1021  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1022 
1023  if (BestA.Src != BB) {
1024  // If we have a trellis, and BB doesn't have the best fallthrough edges,
1025  // we shouldn't choose any successor. We've already looked and there's a
1026  // better fallthrough edge for all the successors.
1027  LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1028  return Result;
1029  }
1030 
1031  // Did we pick the triangle edge? If tail-duplication is profitable, do
1032  // that instead. Otherwise merge the triangle edge now while we know it is
1033  // optimal.
1034  if (BestA.Dest == BestB.Src) {
1035  // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1036  // would be better.
1037  MachineBasicBlock *Succ1 = BestA.Dest;
1038  MachineBasicBlock *Succ2 = BestB.Dest;
1039  // Check to see if tail-duplication would be profitable.
1040  if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1041  canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1042  isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1043  Chain, BlockFilter)) {
1045  MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1046  dbgs() << " Selected: " << getBlockName(Succ2)
1047  << ", probability: " << Succ2Prob
1048  << " (Tail Duplicate)\n");
1049  Result.BB = Succ2;
1050  Result.ShouldTailDup = true;
1051  return Result;
1052  }
1053  }
1054  // We have already computed the optimal edge for the other side of the
1055  // trellis.
1056  ComputedEdges[BestB.Src] = { BestB.Dest, false };
1057 
1058  auto TrellisSucc = BestA.Dest;
1060  MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1061  dbgs() << " Selected: " << getBlockName(TrellisSucc)
1062  << ", probability: " << SuccProb << " (Trellis)\n");
1063  Result.BB = TrellisSucc;
1064  return Result;
1065 }
1066 
1067 /// When the option allowTailDupPlacement() is on, this method checks if the
1068 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1069 /// into all of its unplaced, unfiltered predecessors, that are not BB.
1070 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1071  const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1072  const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1073  if (!shouldTailDuplicate(Succ))
1074  return false;
1075 
1076  // For CFG checking.
1078  BB->succ_end());
1079  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1080  // Make sure all unplaced and unfiltered predecessors can be
1081  // tail-duplicated into.
1082  // Skip any blocks that are already placed or not in this loop.
1083  if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1084  || BlockToChain[Pred] == &Chain)
1085  continue;
1086  if (!TailDup.canTailDuplicate(Succ, Pred)) {
1087  if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1088  // This will result in a trellis after tail duplication, so we don't
1089  // need to copy Succ into this predecessor. In the presence
1090  // of a trellis tail duplication can continue to be profitable.
1091  // For example:
1092  // A A
1093  // |\ |\
1094  // | \ | \
1095  // | C | C+BB
1096  // | / | |
1097  // |/ | |
1098  // BB => BB |
1099  // |\ |\/|
1100  // | \ |/\|
1101  // | D | D
1102  // | / | /
1103  // |/ |/
1104  // Succ Succ
1105  //
1106  // After BB was duplicated into C, the layout looks like the one on the
1107  // right. BB and C now have the same successors. When considering
1108  // whether Succ can be duplicated into all its unplaced predecessors, we
1109  // ignore C.
1110  // We can do this because C already has a profitable fallthrough, namely
1111  // D. TODO(iteratee): ignore sufficiently cold predecessors for
1112  // duplication and for this test.
1113  //
1114  // This allows trellises to be laid out in 2 separate chains
1115  // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1116  // because it allows the creation of 2 fallthrough paths with links
1117  // between them, and we correctly identify the best layout for these
1118  // CFGs. We want to extend trellises that the user created in addition
1119  // to trellises created by tail-duplication, so we just look for the
1120  // CFG.
1121  continue;
1122  return false;
1123  }
1124  }
1125  return true;
1126 }
1127 
1128 /// Find chains of triangles where we believe it would be profitable to
1129 /// tail-duplicate them all, but a local analysis would not find them.
1130 /// There are 3 ways this can be profitable:
1131 /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1132 /// longer chains)
1133 /// 2) The chains are statically correlated. Branch probabilities have a very
1134 /// U-shaped distribution.
1135 /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1136 /// If the branches in a chain are likely to be from the same side of the
1137 /// distribution as their predecessor, but are independent at runtime, this
1138 /// transformation is profitable. (Because the cost of being wrong is a small
1139 /// fixed cost, unlike the standard triangle layout where the cost of being
1140 /// wrong scales with the # of triangles.)
1141 /// 3) The chains are dynamically correlated. If the probability that a previous
1142 /// branch was taken positively influences whether the next branch will be
1143 /// taken
1144 /// We believe that 2 and 3 are common enough to justify the small margin in 1.
1145 void MachineBlockPlacement::precomputeTriangleChains() {
1146  struct TriangleChain {
1147  std::vector<MachineBasicBlock *> Edges;
1148 
1149  TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1150  : Edges({src, dst}) {}
1151 
1152  void append(MachineBasicBlock *dst) {
1153  assert(getKey()->isSuccessor(dst) &&
1154  "Attempting to append a block that is not a successor.");
1155  Edges.push_back(dst);
1156  }
1157 
1158  unsigned count() const { return Edges.size() - 1; }
1159 
1160  MachineBasicBlock *getKey() const {
1161  return Edges.back();
1162  }
1163  };
1164 
1165  if (TriangleChainCount == 0)
1166  return;
1167 
1168  LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1169  // Map from last block to the chain that contains it. This allows us to extend
1170  // chains as we find new triangles.
1172  for (MachineBasicBlock &BB : *F) {
1173  // If BB doesn't have 2 successors, it doesn't start a triangle.
1174  if (BB.succ_size() != 2)
1175  continue;
1176  MachineBasicBlock *PDom = nullptr;
1177  for (MachineBasicBlock *Succ : BB.successors()) {
1178  if (!MPDT->dominates(Succ, &BB))
1179  continue;
1180  PDom = Succ;
1181  break;
1182  }
1183  // If BB doesn't have a post-dominating successor, it doesn't form a
1184  // triangle.
1185  if (PDom == nullptr)
1186  continue;
1187  // If PDom has a hint that it is low probability, skip this triangle.
1188  if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1189  continue;
1190  // If PDom isn't eligible for duplication, this isn't the kind of triangle
1191  // we're looking for.
1192  if (!shouldTailDuplicate(PDom))
1193  continue;
1194  bool CanTailDuplicate = true;
1195  // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1196  // isn't the kind of triangle we're looking for.
1197  for (MachineBasicBlock* Pred : PDom->predecessors()) {
1198  if (Pred == &BB)
1199  continue;
1200  if (!TailDup.canTailDuplicate(PDom, Pred)) {
1201  CanTailDuplicate = false;
1202  break;
1203  }
1204  }
1205  // If we can't tail-duplicate PDom to its predecessors, then skip this
1206  // triangle.
1207  if (!CanTailDuplicate)
1208  continue;
1209 
1210  // Now we have an interesting triangle. Insert it if it's not part of an
1211  // existing chain.
1212  // Note: This cannot be replaced with a call insert() or emplace() because
1213  // the find key is BB, but the insert/emplace key is PDom.
1214  auto Found = TriangleChainMap.find(&BB);
1215  // If it is, remove the chain from the map, grow it, and put it back in the
1216  // map with the end as the new key.
1217  if (Found != TriangleChainMap.end()) {
1218  TriangleChain Chain = std::move(Found->second);
1219  TriangleChainMap.erase(Found);
1220  Chain.append(PDom);
1221  TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1222  } else {
1223  auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1224  assert(InsertResult.second && "Block seen twice.");
1225  (void)InsertResult;
1226  }
1227  }
1228 
1229  // Iterating over a DenseMap is safe here, because the only thing in the body
1230  // of the loop is inserting into another DenseMap (ComputedEdges).
1231  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1232  for (auto &ChainPair : TriangleChainMap) {
1233  TriangleChain &Chain = ChainPair.second;
1234  // Benchmarking has shown that due to branch correlation duplicating 2 or
1235  // more triangles is profitable, despite the calculations assuming
1236  // independence.
1237  if (Chain.count() < TriangleChainCount)
1238  continue;
1239  MachineBasicBlock *dst = Chain.Edges.back();
1240  Chain.Edges.pop_back();
1241  for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1242  LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1243  << getBlockName(dst)
1244  << " as pre-computed based on triangles.\n");
1245 
1246  auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1247  assert(InsertResult.second && "Block seen twice.");
1248  (void)InsertResult;
1249 
1250  dst = src;
1251  }
1252  }
1253 }
1254 
1255 // When profile is not present, return the StaticLikelyProb.
1256 // When profile is available, we need to handle the triangle-shape CFG.
1258  const MachineBasicBlock *BB) {
1259  if (!BB->getParent()->getFunction().hasProfileData())
1260  return BranchProbability(StaticLikelyProb, 100);
1261  if (BB->succ_size() == 2) {
1262  const MachineBasicBlock *Succ1 = *BB->succ_begin();
1263  const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1264  if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1265  /* See case 1 below for the cost analysis. For BB->Succ to
1266  * be taken with smaller cost, the following needs to hold:
1267  * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1268  * So the threshold T in the calculation below
1269  * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1270  * So T / (1 - T) = 2, Yielding T = 2/3
1271  * Also adding user specified branch bias, we have
1272  * T = (2/3)*(ProfileLikelyProb/50)
1273  * = (2*ProfileLikelyProb)/150)
1274  */
1275  return BranchProbability(2 * ProfileLikelyProb, 150);
1276  }
1277  }
1278  return BranchProbability(ProfileLikelyProb, 100);
1279 }
1280 
1281 /// Checks to see if the layout candidate block \p Succ has a better layout
1282 /// predecessor than \c BB. If yes, returns true.
1283 /// \p SuccProb: The probability adjusted for only remaining blocks.
1284 /// Only used for logging
1285 /// \p RealSuccProb: The un-adjusted probability.
1286 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
1287 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1288 /// considered
1289 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1290  const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1291  const BlockChain &SuccChain, BranchProbability SuccProb,
1292  BranchProbability RealSuccProb, const BlockChain &Chain,
1293  const BlockFilterSet *BlockFilter) {
1294 
1295  // There isn't a better layout when there are no unscheduled predecessors.
1296  if (SuccChain.UnscheduledPredecessors == 0)
1297  return false;
1298 
1299  // There are two basic scenarios here:
1300  // -------------------------------------
1301  // Case 1: triangular shape CFG (if-then):
1302  // BB
1303  // | \
1304  // | \
1305  // | Pred
1306  // | /
1307  // Succ
1308  // In this case, we are evaluating whether to select edge -> Succ, e.g.
1309  // set Succ as the layout successor of BB. Picking Succ as BB's
1310  // successor breaks the CFG constraints (FIXME: define these constraints).
1311  // With this layout, Pred BB
1312  // is forced to be outlined, so the overall cost will be cost of the
1313  // branch taken from BB to Pred, plus the cost of back taken branch
1314  // from Pred to Succ, as well as the additional cost associated
1315  // with the needed unconditional jump instruction from Pred To Succ.
1316 
1317  // The cost of the topological order layout is the taken branch cost
1318  // from BB to Succ, so to make BB->Succ a viable candidate, the following
1319  // must hold:
1320  // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1321  // < freq(BB->Succ) * taken_branch_cost.
1322  // Ignoring unconditional jump cost, we get
1323  // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1324  // prob(BB->Succ) > 2 * prob(BB->Pred)
1325  //
1326  // When real profile data is available, we can precisely compute the
1327  // probability threshold that is needed for edge BB->Succ to be considered.
1328  // Without profile data, the heuristic requires the branch bias to be
1329  // a lot larger to make sure the signal is very strong (e.g. 80% default).
1330  // -----------------------------------------------------------------
1331  // Case 2: diamond like CFG (if-then-else):
1332  // S
1333  // / \
1334  // | \
1335  // BB Pred
1336  // \ /
1337  // Succ
1338  // ..
1339  //
1340  // The current block is BB and edge BB->Succ is now being evaluated.
1341  // Note that edge S->BB was previously already selected because
1342  // prob(S->BB) > prob(S->Pred).
1343  // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1344  // choose Pred, we will have a topological ordering as shown on the left
1345  // in the picture below. If we choose Succ, we have the solution as shown
1346  // on the right:
1347  //
1348  // topo-order:
1349  //
1350  // S----- ---S
1351  // | | | |
1352  // ---BB | | BB
1353  // | | | |
1354  // | Pred-- | Succ--
1355  // | | | |
1356  // ---Succ ---Pred--
1357  //
1358  // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1359  // = freq(S->Pred) + freq(S->BB)
1360  //
1361  // If we have profile data (i.e, branch probabilities can be trusted), the
1362  // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1363  // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1364  // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1365  // means the cost of topological order is greater.
1366  // When profile data is not available, however, we need to be more
1367  // conservative. If the branch prediction is wrong, breaking the topo-order
1368  // will actually yield a layout with large cost. For this reason, we need
1369  // strong biased branch at block S with Prob(S->BB) in order to select
1370  // BB->Succ. This is equivalent to looking the CFG backward with backward
1371  // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1372  // profile data).
1373  // --------------------------------------------------------------------------
1374  // Case 3: forked diamond
1375  // S
1376  // / \
1377  // / \
1378  // BB Pred
1379  // | \ / |
1380  // | \ / |
1381  // | X |
1382  // | / \ |
1383  // | / \ |
1384  // S1 S2
1385  //
1386  // The current block is BB and edge BB->S1 is now being evaluated.
1387  // As above S->BB was already selected because
1388  // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1389  //
1390  // topo-order:
1391  //
1392  // S-------| ---S
1393  // | | | |
1394  // ---BB | | BB
1395  // | | | |
1396  // | Pred----| | S1----
1397  // | | | |
1398  // --(S1 or S2) ---Pred--
1399  // |
1400  // S2
1401  //
1402  // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1403  // + min(freq(Pred->S1), freq(Pred->S2))
1404  // Non-topo-order cost:
1405  // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1406  // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1407  // is 0. Then the non topo layout is better when
1408  // freq(S->Pred) < freq(BB->S1).
1409  // This is exactly what is checked below.
1410  // Note there are other shapes that apply (Pred may not be a single block,
1411  // but they all fit this general pattern.)
1413 
1414  // Make sure that a hot successor doesn't have a globally more
1415  // important predecessor.
1416  BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1417  bool BadCFGConflict = false;
1418 
1419  for (MachineBasicBlock *Pred : Succ->predecessors()) {
1420  if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
1421  (BlockFilter && !BlockFilter->count(Pred)) ||
1422  BlockToChain[Pred] == &Chain ||
1423  // This check is redundant except for look ahead. This function is
1424  // called for lookahead by isProfitableToTailDup when BB hasn't been
1425  // placed yet.
1426  (Pred == BB))
1427  continue;
1428  // Do backward checking.
1429  // For all cases above, we need a backward checking to filter out edges that
1430  // are not 'strongly' biased.
1431  // BB Pred
1432  // \ /
1433  // Succ
1434  // We select edge BB->Succ if
1435  // freq(BB->Succ) > freq(Succ) * HotProb
1436  // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1437  // HotProb
1438  // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1439  // Case 1 is covered too, because the first equation reduces to:
1440  // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1441  BlockFrequency PredEdgeFreq =
1442  MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1443  if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1444  BadCFGConflict = true;
1445  break;
1446  }
1447  }
1448 
1449  if (BadCFGConflict) {
1450  LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1451  << SuccProb << " (prob) (non-cold CFG conflict)\n");
1452  return true;
1453  }
1454 
1455  return false;
1456 }
1457 
1458 /// Select the best successor for a block.
1459 ///
1460 /// This looks across all successors of a particular block and attempts to
1461 /// select the "best" one to be the layout successor. It only considers direct
1462 /// successors which also pass the block filter. It will attempt to avoid
1463 /// breaking CFG structure, but cave and break such structures in the case of
1464 /// very hot successor edges.
1465 ///
1466 /// \returns The best successor block found, or null if none are viable, along
1467 /// with a boolean indicating if tail duplication is necessary.
1468 MachineBlockPlacement::BlockAndTailDupResult
1469 MachineBlockPlacement::selectBestSuccessor(
1470  const MachineBasicBlock *BB, const BlockChain &Chain,
1471  const BlockFilterSet *BlockFilter) {
1472  const BranchProbability HotProb(StaticLikelyProb, 100);
1473 
1474  BlockAndTailDupResult BestSucc = { nullptr, false };
1475  auto BestProb = BranchProbability::getZero();
1476 
1478  auto AdjustedSumProb =
1479  collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1480 
1481  LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1482  << "\n");
1483 
1484  // if we already precomputed the best successor for BB, return that if still
1485  // applicable.
1486  auto FoundEdge = ComputedEdges.find(BB);
1487  if (FoundEdge != ComputedEdges.end()) {
1488  MachineBasicBlock *Succ = FoundEdge->second.BB;
1489  ComputedEdges.erase(FoundEdge);
1490  BlockChain *SuccChain = BlockToChain[Succ];
1491  if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1492  SuccChain != &Chain && Succ == *SuccChain->begin())
1493  return FoundEdge->second;
1494  }
1495 
1496  // if BB is part of a trellis, Use the trellis to determine the optimal
1497  // fallthrough edges
1498  if (isTrellis(BB, Successors, Chain, BlockFilter))
1499  return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1500  BlockFilter);
1501 
1502  // For blocks with CFG violations, we may be able to lay them out anyway with
1503  // tail-duplication. We keep this vector so we can perform the probability
1504  // calculations the minimum number of times.
1506  DupCandidates;
1507  for (MachineBasicBlock *Succ : Successors) {
1508  auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1509  BranchProbability SuccProb =
1510  getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1511 
1512  BlockChain &SuccChain = *BlockToChain[Succ];
1513  // Skip the edge \c BB->Succ if block \c Succ has a better layout
1514  // predecessor that yields lower global cost.
1515  if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1516  Chain, BlockFilter)) {
1517  // If tail duplication would make Succ profitable, place it.
1518  if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1519  DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
1520  continue;
1521  }
1522 
1523  LLVM_DEBUG(
1524  dbgs() << " Candidate: " << getBlockName(Succ)
1525  << ", probability: " << SuccProb
1526  << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1527  << "\n");
1528 
1529  if (BestSucc.BB && BestProb >= SuccProb) {
1530  LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1531  continue;
1532  }
1533 
1534  LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1535  BestSucc.BB = Succ;
1536  BestProb = SuccProb;
1537  }
1538  // Handle the tail duplication candidates in order of decreasing probability.
1539  // Stop at the first one that is profitable. Also stop if they are less
1540  // profitable than BestSucc. Position is important because we preserve it and
1541  // prefer first best match. Here we aren't comparing in order, so we capture
1542  // the position instead.
1543  llvm::stable_sort(DupCandidates,
1544  [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1545  std::tuple<BranchProbability, MachineBasicBlock *> R) {
1546  return std::get<0>(L) > std::get<0>(R);
1547  });
1548  for (auto &Tup : DupCandidates) {
1549  BranchProbability DupProb;
1550  MachineBasicBlock *Succ;
1551  std::tie(DupProb, Succ) = Tup;
1552  if (DupProb < BestProb)
1553  break;
1554  if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1555  && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1556  LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1557  << ", probability: " << DupProb
1558  << " (Tail Duplicate)\n");
1559  BestSucc.BB = Succ;
1560  BestSucc.ShouldTailDup = true;
1561  break;
1562  }
1563  }
1564 
1565  if (BestSucc.BB)
1566  LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1567 
1568  return BestSucc;
1569 }
1570 
1571 /// Select the best block from a worklist.
1572 ///
1573 /// This looks through the provided worklist as a list of candidate basic
1574 /// blocks and select the most profitable one to place. The definition of
1575 /// profitable only really makes sense in the context of a loop. This returns
1576 /// the most frequently visited block in the worklist, which in the case of
1577 /// a loop, is the one most desirable to be physically close to the rest of the
1578 /// loop body in order to improve i-cache behavior.
1579 ///
1580 /// \returns The best block found, or null if none are viable.
1581 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1582  const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1583  // Once we need to walk the worklist looking for a candidate, cleanup the
1584  // worklist of already placed entries.
1585  // FIXME: If this shows up on profiles, it could be folded (at the cost of
1586  // some code complexity) into the loop below.
1587  WorkList.erase(llvm::remove_if(WorkList,
1588  [&](MachineBasicBlock *BB) {
1589  return BlockToChain.lookup(BB) == &Chain;
1590  }),
1591  WorkList.end());
1592 
1593  if (WorkList.empty())
1594  return nullptr;
1595 
1596  bool IsEHPad = WorkList[0]->isEHPad();
1597 
1598  MachineBasicBlock *BestBlock = nullptr;
1599  BlockFrequency BestFreq;
1600  for (MachineBasicBlock *MBB : WorkList) {
1601  assert(MBB->isEHPad() == IsEHPad &&
1602  "EHPad mismatch between block and work list.");
1603 
1604  BlockChain &SuccChain = *BlockToChain[MBB];
1605  if (&SuccChain == &Chain)
1606  continue;
1607 
1608  assert(SuccChain.UnscheduledPredecessors == 0 &&
1609  "Found CFG-violating block");
1610 
1611  BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1612  LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
1613  MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
1614 
1615  // For ehpad, we layout the least probable first as to avoid jumping back
1616  // from least probable landingpads to more probable ones.
1617  //
1618  // FIXME: Using probability is probably (!) not the best way to achieve
1619  // this. We should probably have a more principled approach to layout
1620  // cleanup code.
1621  //
1622  // The goal is to get:
1623  //
1624  // +--------------------------+
1625  // | V
1626  // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1627  //
1628  // Rather than:
1629  //
1630  // +-------------------------------------+
1631  // V |
1632  // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1633  if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1634  continue;
1635 
1636  BestBlock = MBB;
1637  BestFreq = CandidateFreq;
1638  }
1639 
1640  return BestBlock;
1641 }
1642 
1643 /// Retrieve the first unplaced basic block.
1644 ///
1645 /// This routine is called when we are unable to use the CFG to walk through
1646 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1647 /// We walk through the function's blocks in order, starting from the
1648 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
1649 /// re-scanning the entire sequence on repeated calls to this routine.
1650 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1651  const BlockChain &PlacedChain,
1652  MachineFunction::iterator &PrevUnplacedBlockIt,
1653  const BlockFilterSet *BlockFilter) {
1654  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1655  ++I) {
1656  if (BlockFilter && !BlockFilter->count(&*I))
1657  continue;
1658  if (BlockToChain[&*I] != &PlacedChain) {
1659  PrevUnplacedBlockIt = I;
1660  // Now select the head of the chain to which the unplaced block belongs
1661  // as the block to place. This will force the entire chain to be placed,
1662  // and satisfies the requirements of merging chains.
1663  return *BlockToChain[&*I]->begin();
1664  }
1665  }
1666  return nullptr;
1667 }
1668 
1669 void MachineBlockPlacement::fillWorkLists(
1670  const MachineBasicBlock *MBB,
1671  SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1672  const BlockFilterSet *BlockFilter = nullptr) {
1673  BlockChain &Chain = *BlockToChain[MBB];
1674  if (!UpdatedPreds.insert(&Chain).second)
1675  return;
1676 
1677  assert(
1678  Chain.UnscheduledPredecessors == 0 &&
1679  "Attempting to place block with unscheduled predecessors in worklist.");
1680  for (MachineBasicBlock *ChainBB : Chain) {
1681  assert(BlockToChain[ChainBB] == &Chain &&
1682  "Block in chain doesn't match BlockToChain map.");
1683  for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1684  if (BlockFilter && !BlockFilter->count(Pred))
1685  continue;
1686  if (BlockToChain[Pred] == &Chain)
1687  continue;
1688  ++Chain.UnscheduledPredecessors;
1689  }
1690  }
1691 
1692  if (Chain.UnscheduledPredecessors != 0)
1693  return;
1694 
1695  MachineBasicBlock *BB = *Chain.begin();
1696  if (BB->isEHPad())
1697  EHPadWorkList.push_back(BB);
1698  else
1699  BlockWorkList.push_back(BB);
1700 }
1701 
1702 void MachineBlockPlacement::buildChain(
1703  const MachineBasicBlock *HeadBB, BlockChain &Chain,
1704  BlockFilterSet *BlockFilter) {
1705  assert(HeadBB && "BB must not be null.\n");
1706  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1707  MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1708 
1709  const MachineBasicBlock *LoopHeaderBB = HeadBB;
1710  markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1711  MachineBasicBlock *BB = *std::prev(Chain.end());
1712  while (true) {
1713  assert(BB && "null block found at end of chain in loop.");
1714  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1715  assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1716 
1717 
1718  // Look for the best viable successor if there is one to place immediately
1719  // after this block.
1720  auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1721  MachineBasicBlock* BestSucc = Result.BB;
1722  bool ShouldTailDup = Result.ShouldTailDup;
1723  if (allowTailDupPlacement())
1724  ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
1725 
1726  // If an immediate successor isn't available, look for the best viable
1727  // block among those we've identified as not violating the loop's CFG at
1728  // this point. This won't be a fallthrough, but it will increase locality.
1729  if (!BestSucc)
1730  BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1731  if (!BestSucc)
1732  BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1733 
1734  if (!BestSucc) {
1735  BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
1736  if (!BestSucc)
1737  break;
1738 
1739  LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1740  "layout successor until the CFG reduces\n");
1741  }
1742 
1743  // Placement may have changed tail duplication opportunities.
1744  // Check for that now.
1745  if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1746  // If the chosen successor was duplicated into all its predecessors,
1747  // don't bother laying it out, just go round the loop again with BB as
1748  // the chain end.
1749  if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1750  BlockFilter, PrevUnplacedBlockIt))
1751  continue;
1752  }
1753 
1754  // Place this block, updating the datastructures to reflect its placement.
1755  BlockChain &SuccChain = *BlockToChain[BestSucc];
1756  // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1757  // we selected a successor that didn't fit naturally into the CFG.
1758  SuccChain.UnscheduledPredecessors = 0;
1759  LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1760  << getBlockName(BestSucc) << "\n");
1761  markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1762  Chain.merge(BestSucc, &SuccChain);
1763  BB = *std::prev(Chain.end());
1764  }
1765 
1766  LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1767  << getBlockName(*Chain.begin()) << "\n");
1768 }
1769 
1770 // If bottom of block BB has only one successor OldTop, in most cases it is
1771 // profitable to move it before OldTop, except the following case:
1772 //
1773 // -->OldTop<-
1774 // | . |
1775 // | . |
1776 // | . |
1777 // ---Pred |
1778 // | |
1779 // BB-----
1780 //
1781 // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1782 // layout the other successor below it, so it can't reduce taken branch.
1783 // In this case we keep its original layout.
1784 bool
1785 MachineBlockPlacement::canMoveBottomBlockToTop(
1786  const MachineBasicBlock *BottomBlock,
1787  const MachineBasicBlock *OldTop) {
1788  if (BottomBlock->pred_size() != 1)
1789  return true;
1790  MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1791  if (Pred->succ_size() != 2)
1792  return true;
1793 
1794  MachineBasicBlock *OtherBB = *Pred->succ_begin();
1795  if (OtherBB == BottomBlock)
1796  OtherBB = *Pred->succ_rbegin();
1797  if (OtherBB == OldTop)
1798  return false;
1799 
1800  return true;
1801 }
1802 
1803 // Find out the possible fall through frequence to the top of a loop.
1805 MachineBlockPlacement::TopFallThroughFreq(
1806  const MachineBasicBlock *Top,
1807  const BlockFilterSet &LoopBlockSet) {
1808  BlockFrequency MaxFreq = 0;
1809  for (MachineBasicBlock *Pred : Top->predecessors()) {
1810  BlockChain *PredChain = BlockToChain[Pred];
1811  if (!LoopBlockSet.count(Pred) &&
1812  (!PredChain || Pred == *std::prev(PredChain->end()))) {
1813  // Found a Pred block can be placed before Top.
1814  // Check if Top is the best successor of Pred.
1815  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1816  bool TopOK = true;
1817  for (MachineBasicBlock *Succ : Pred->successors()) {
1818  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1819  BlockChain *SuccChain = BlockToChain[Succ];
1820  // Check if Succ can be placed after Pred.
1821  // Succ should not be in any chain, or it is the head of some chain.
1822  if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1823  (!SuccChain || Succ == *SuccChain->begin())) {
1824  TopOK = false;
1825  break;
1826  }
1827  }
1828  if (TopOK) {
1829  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1830  MBPI->getEdgeProbability(Pred, Top);
1831  if (EdgeFreq > MaxFreq)
1832  MaxFreq = EdgeFreq;
1833  }
1834  }
1835  }
1836  return MaxFreq;
1837 }
1838 
1839 // Compute the fall through gains when move NewTop before OldTop.
1840 //
1841 // In following diagram, edges marked as "-" are reduced fallthrough, edges
1842 // marked as "+" are increased fallthrough, this function computes
1843 //
1844 // SUM(increased fallthrough) - SUM(decreased fallthrough)
1845 //
1846 // |
1847 // | -
1848 // V
1849 // --->OldTop
1850 // | .
1851 // | .
1852 // +| . +
1853 // | Pred --->
1854 // | |-
1855 // | V
1856 // --- NewTop <---
1857 // |-
1858 // V
1859 //
1861 MachineBlockPlacement::FallThroughGains(
1862  const MachineBasicBlock *NewTop,
1863  const MachineBasicBlock *OldTop,
1864  const MachineBasicBlock *ExitBB,
1865  const BlockFilterSet &LoopBlockSet) {
1866  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
1867  BlockFrequency FallThrough2Exit = 0;
1868  if (ExitBB)
1869  FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
1870  MBPI->getEdgeProbability(NewTop, ExitBB);
1871  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
1872  MBPI->getEdgeProbability(NewTop, OldTop);
1873 
1874  // Find the best Pred of NewTop.
1875  MachineBasicBlock *BestPred = nullptr;
1876  BlockFrequency FallThroughFromPred = 0;
1877  for (MachineBasicBlock *Pred : NewTop->predecessors()) {
1878  if (!LoopBlockSet.count(Pred))
1879  continue;
1880  BlockChain *PredChain = BlockToChain[Pred];
1881  if (!PredChain || Pred == *std::prev(PredChain->end())) {
1882  BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1883  MBPI->getEdgeProbability(Pred, NewTop);
1884  if (EdgeFreq > FallThroughFromPred) {
1885  FallThroughFromPred = EdgeFreq;
1886  BestPred = Pred;
1887  }
1888  }
1889  }
1890 
1891  // If NewTop is not placed after Pred, another successor can be placed
1892  // after Pred.
1893  BlockFrequency NewFreq = 0;
1894  if (BestPred) {
1895  for (MachineBasicBlock *Succ : BestPred->successors()) {
1896  if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
1897  continue;
1898  if (ComputedEdges.find(Succ) != ComputedEdges.end())
1899  continue;
1900  BlockChain *SuccChain = BlockToChain[Succ];
1901  if ((SuccChain && (Succ != *SuccChain->begin())) ||
1902  (SuccChain == BlockToChain[BestPred]))
1903  continue;
1904  BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
1905  MBPI->getEdgeProbability(BestPred, Succ);
1906  if (EdgeFreq > NewFreq)
1907  NewFreq = EdgeFreq;
1908  }
1909  BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
1910  MBPI->getEdgeProbability(BestPred, NewTop);
1911  if (NewFreq > OrigEdgeFreq) {
1912  // If NewTop is not the best successor of Pred, then Pred doesn't
1913  // fallthrough to NewTop. So there is no FallThroughFromPred and
1914  // NewFreq.
1915  NewFreq = 0;
1916  FallThroughFromPred = 0;
1917  }
1918  }
1919 
1920  BlockFrequency Result = 0;
1921  BlockFrequency Gains = BackEdgeFreq + NewFreq;
1922  BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
1923  FallThroughFromPred;
1924  if (Gains > Lost)
1925  Result = Gains - Lost;
1926  return Result;
1927 }
1928 
1929 /// Helper function of findBestLoopTop. Find the best loop top block
1930 /// from predecessors of old top.
1931 ///
1932 /// Look for a block which is strictly better than the old top for laying
1933 /// out before the old top of the loop. This looks for only two patterns:
1934 ///
1935 /// 1. a block has only one successor, the old loop top
1936 ///
1937 /// Because such a block will always result in an unconditional jump,
1938 /// rotating it in front of the old top is always profitable.
1939 ///
1940 /// 2. a block has two successors, one is old top, another is exit
1941 /// and it has more than one predecessors
1942 ///
1943 /// If it is below one of its predecessors P, only P can fall through to
1944 /// it, all other predecessors need a jump to it, and another conditional
1945 /// jump to loop header. If it is moved before loop header, all its
1946 /// predecessors jump to it, then fall through to loop header. So all its
1947 /// predecessors except P can reduce one taken branch.
1948 /// At the same time, move it before old top increases the taken branch
1949 /// to loop exit block, so the reduced taken branch will be compared with
1950 /// the increased taken branch to the loop exit block.
1952 MachineBlockPlacement::findBestLoopTopHelper(
1953  MachineBasicBlock *OldTop,
1954  const MachineLoop &L,
1955  const BlockFilterSet &LoopBlockSet) {
1956  // Check that the header hasn't been fused with a preheader block due to
1957  // crazy branches. If it has, we need to start with the header at the top to
1958  // prevent pulling the preheader into the loop body.
1959  BlockChain &HeaderChain = *BlockToChain[OldTop];
1960  if (!LoopBlockSet.count(*HeaderChain.begin()))
1961  return OldTop;
1962 
1963  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
1964  << "\n");
1965 
1966  BlockFrequency BestGains = 0;
1967  MachineBasicBlock *BestPred = nullptr;
1968  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
1969  if (!LoopBlockSet.count(Pred))
1970  continue;
1971  if (Pred == L.getHeader())
1972  continue;
1973  LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
1974  << Pred->succ_size() << " successors, ";
1975  MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
1976  if (Pred->succ_size() > 2)
1977  continue;
1978 
1979  MachineBasicBlock *OtherBB = nullptr;
1980  if (Pred->succ_size() == 2) {
1981  OtherBB = *Pred->succ_begin();
1982  if (OtherBB == OldTop)
1983  OtherBB = *Pred->succ_rbegin();
1984  }
1985 
1986  if (!canMoveBottomBlockToTop(Pred, OldTop))
1987  continue;
1988 
1989  BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
1990  LoopBlockSet);
1991  if ((Gains > 0) && (Gains > BestGains ||
1992  ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
1993  BestPred = Pred;
1994  BestGains = Gains;
1995  }
1996  }
1997 
1998  // If no direct predecessor is fine, just use the loop header.
1999  if (!BestPred) {
2000  LLVM_DEBUG(dbgs() << " final top unchanged\n");
2001  return OldTop;
2002  }
2003 
2004  // Walk backwards through any straight line of predecessors.
2005  while (BestPred->pred_size() == 1 &&
2006  (*BestPred->pred_begin())->succ_size() == 1 &&
2007  *BestPred->pred_begin() != L.getHeader())
2008  BestPred = *BestPred->pred_begin();
2009 
2010  LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2011  return BestPred;
2012 }
2013 
2014 /// Find the best loop top block for layout.
2015 ///
2016 /// This function iteratively calls findBestLoopTopHelper, until no new better
2017 /// BB can be found.
2019 MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2020  const BlockFilterSet &LoopBlockSet) {
2021  // Placing the latch block before the header may introduce an extra branch
2022  // that skips this block the first time the loop is executed, which we want
2023  // to avoid when optimising for size.
2024  // FIXME: in theory there is a case that does not introduce a new branch,
2025  // i.e. when the layout predecessor does not fallthrough to the loop header.
2026  // In practice this never happens though: there always seems to be a preheader
2027  // that can fallthrough and that is also placed before the header.
2028  if (F->getFunction().hasOptSize())
2029  return L.getHeader();
2030 
2031  MachineBasicBlock *OldTop = nullptr;
2032  MachineBasicBlock *NewTop = L.getHeader();
2033  while (NewTop != OldTop) {
2034  OldTop = NewTop;
2035  NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2036  if (NewTop != OldTop)
2037  ComputedEdges[NewTop] = { OldTop, false };
2038  }
2039  return NewTop;
2040 }
2041 
2042 /// Find the best loop exiting block for layout.
2043 ///
2044 /// This routine implements the logic to analyze the loop looking for the best
2045 /// block to layout at the top of the loop. Typically this is done to maximize
2046 /// fallthrough opportunities.
2048 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2049  const BlockFilterSet &LoopBlockSet,
2050  BlockFrequency &ExitFreq) {
2051  // We don't want to layout the loop linearly in all cases. If the loop header
2052  // is just a normal basic block in the loop, we want to look for what block
2053  // within the loop is the best one to layout at the top. However, if the loop
2054  // header has be pre-merged into a chain due to predecessors not having
2055  // analyzable branches, *and* the predecessor it is merged with is *not* part
2056  // of the loop, rotating the header into the middle of the loop will create
2057  // a non-contiguous range of blocks which is Very Bad. So start with the
2058  // header and only rotate if safe.
2059  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2060  if (!LoopBlockSet.count(*HeaderChain.begin()))
2061  return nullptr;
2062 
2063  BlockFrequency BestExitEdgeFreq;
2064  unsigned BestExitLoopDepth = 0;
2065  MachineBasicBlock *ExitingBB = nullptr;
2066  // If there are exits to outer loops, loop rotation can severely limit
2067  // fallthrough opportunities unless it selects such an exit. Keep a set of
2068  // blocks where rotating to exit with that block will reach an outer loop.
2069  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2070 
2071  LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2072  << getBlockName(L.getHeader()) << "\n");
2073  for (MachineBasicBlock *MBB : L.getBlocks()) {
2074  BlockChain &Chain = *BlockToChain[MBB];
2075  // Ensure that this block is at the end of a chain; otherwise it could be
2076  // mid-way through an inner loop or a successor of an unanalyzable branch.
2077  if (MBB != *std::prev(Chain.end()))
2078  continue;
2079 
2080  // Now walk the successors. We need to establish whether this has a viable
2081  // exiting successor and whether it has a viable non-exiting successor.
2082  // We store the old exiting state and restore it if a viable looping
2083  // successor isn't found.
2084  MachineBasicBlock *OldExitingBB = ExitingBB;
2085  BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2086  bool HasLoopingSucc = false;
2087  for (MachineBasicBlock *Succ : MBB->successors()) {
2088  if (Succ->isEHPad())
2089  continue;
2090  if (Succ == MBB)
2091  continue;
2092  BlockChain &SuccChain = *BlockToChain[Succ];
2093  // Don't split chains, either this chain or the successor's chain.
2094  if (&Chain == &SuccChain) {
2095  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2096  << getBlockName(Succ) << " (chain conflict)\n");
2097  continue;
2098  }
2099 
2100  auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2101  if (LoopBlockSet.count(Succ)) {
2102  LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2103  << getBlockName(Succ) << " (" << SuccProb << ")\n");
2104  HasLoopingSucc = true;
2105  continue;
2106  }
2107 
2108  unsigned SuccLoopDepth = 0;
2109  if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2110  SuccLoopDepth = ExitLoop->getLoopDepth();
2111  if (ExitLoop->contains(&L))
2112  BlocksExitingToOuterLoop.insert(MBB);
2113  }
2114 
2115  BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2116  LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2117  << getBlockName(Succ) << " [L:" << SuccLoopDepth
2118  << "] (";
2119  MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
2120  // Note that we bias this toward an existing layout successor to retain
2121  // incoming order in the absence of better information. The exit must have
2122  // a frequency higher than the current exit before we consider breaking
2123  // the layout.
2124  BranchProbability Bias(100 - ExitBlockBias, 100);
2125  if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2126  ExitEdgeFreq > BestExitEdgeFreq ||
2127  (MBB->isLayoutSuccessor(Succ) &&
2128  !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2129  BestExitEdgeFreq = ExitEdgeFreq;
2130  ExitingBB = MBB;
2131  }
2132  }
2133 
2134  if (!HasLoopingSucc) {
2135  // Restore the old exiting state, no viable looping successor was found.
2136  ExitingBB = OldExitingBB;
2137  BestExitEdgeFreq = OldBestExitEdgeFreq;
2138  }
2139  }
2140  // Without a candidate exiting block or with only a single block in the
2141  // loop, just use the loop header to layout the loop.
2142  if (!ExitingBB) {
2143  LLVM_DEBUG(
2144  dbgs() << " No other candidate exit blocks, using loop header\n");
2145  return nullptr;
2146  }
2147  if (L.getNumBlocks() == 1) {
2148  LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2149  return nullptr;
2150  }
2151 
2152  // Also, if we have exit blocks which lead to outer loops but didn't select
2153  // one of them as the exiting block we are rotating toward, disable loop
2154  // rotation altogether.
2155  if (!BlocksExitingToOuterLoop.empty() &&
2156  !BlocksExitingToOuterLoop.count(ExitingBB))
2157  return nullptr;
2158 
2159  LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2160  << "\n");
2161  ExitFreq = BestExitEdgeFreq;
2162  return ExitingBB;
2163 }
2164 
2165 /// Check if there is a fallthrough to loop header Top.
2166 ///
2167 /// 1. Look for a Pred that can be layout before Top.
2168 /// 2. Check if Top is the most possible successor of Pred.
2169 bool
2170 MachineBlockPlacement::hasViableTopFallthrough(
2171  const MachineBasicBlock *Top,
2172  const BlockFilterSet &LoopBlockSet) {
2173  for (MachineBasicBlock *Pred : Top->predecessors()) {
2174  BlockChain *PredChain = BlockToChain[Pred];
2175  if (!LoopBlockSet.count(Pred) &&
2176  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2177  // Found a Pred block can be placed before Top.
2178  // Check if Top is the best successor of Pred.
2179  auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2180  bool TopOK = true;
2181  for (MachineBasicBlock *Succ : Pred->successors()) {
2182  auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2183  BlockChain *SuccChain = BlockToChain[Succ];
2184  // Check if Succ can be placed after Pred.
2185  // Succ should not be in any chain, or it is the head of some chain.
2186  if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2187  TopOK = false;
2188  break;
2189  }
2190  }
2191  if (TopOK)
2192  return true;
2193  }
2194  }
2195  return false;
2196 }
2197 
2198 /// Attempt to rotate an exiting block to the bottom of the loop.
2199 ///
2200 /// Once we have built a chain, try to rotate it to line up the hot exit block
2201 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2202 /// branches. For example, if the loop has fallthrough into its header and out
2203 /// of its bottom already, don't rotate it.
2204 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2205  const MachineBasicBlock *ExitingBB,
2206  BlockFrequency ExitFreq,
2207  const BlockFilterSet &LoopBlockSet) {
2208  if (!ExitingBB)
2209  return;
2210 
2211  MachineBasicBlock *Top = *LoopChain.begin();
2212  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2213 
2214  // If ExitingBB is already the last one in a chain then nothing to do.
2215  if (Bottom == ExitingBB)
2216  return;
2217 
2218  bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2219 
2220  // If the header has viable fallthrough, check whether the current loop
2221  // bottom is a viable exiting block. If so, bail out as rotating will
2222  // introduce an unnecessary branch.
2223  if (ViableTopFallthrough) {
2224  for (MachineBasicBlock *Succ : Bottom->successors()) {
2225  BlockChain *SuccChain = BlockToChain[Succ];
2226  if (!LoopBlockSet.count(Succ) &&
2227  (!SuccChain || Succ == *SuccChain->begin()))
2228  return;
2229  }
2230 
2231  // Rotate will destroy the top fallthrough, we need to ensure the new exit
2232  // frequency is larger than top fallthrough.
2233  BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2234  if (FallThrough2Top >= ExitFreq)
2235  return;
2236  }
2237 
2238  BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2239  if (ExitIt == LoopChain.end())
2240  return;
2241 
2242  // Rotating a loop exit to the bottom when there is a fallthrough to top
2243  // trades the entry fallthrough for an exit fallthrough.
2244  // If there is no bottom->top edge, but the chosen exit block does have
2245  // a fallthrough, we break that fallthrough for nothing in return.
2246 
2247  // Let's consider an example. We have a built chain of basic blocks
2248  // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2249  // By doing a rotation we get
2250  // Bk+1, ..., Bn, B1, ..., Bk
2251  // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2252  // If we had a fallthrough Bk -> Bk+1 it is broken now.
2253  // It might be compensated by fallthrough Bn -> B1.
2254  // So we have a condition to avoid creation of extra branch by loop rotation.
2255  // All below must be true to avoid loop rotation:
2256  // If there is a fallthrough to top (B1)
2257  // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2258  // There is no fallthrough from bottom (Bn) to top (B1).
2259  // Please note that there is no exit fallthrough from Bn because we checked it
2260  // above.
2261  if (ViableTopFallthrough) {
2262  assert(std::next(ExitIt) != LoopChain.end() &&
2263  "Exit should not be last BB");
2264  MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2265  if (ExitingBB->isSuccessor(NextBlockInChain))
2266  if (!Bottom->isSuccessor(Top))
2267  return;
2268  }
2269 
2270  LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2271  << " at bottom\n");
2272  std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2273 }
2274 
2275 /// Attempt to rotate a loop based on profile data to reduce branch cost.
2276 ///
2277 /// With profile data, we can determine the cost in terms of missed fall through
2278 /// opportunities when rotating a loop chain and select the best rotation.
2279 /// Basically, there are three kinds of cost to consider for each rotation:
2280 /// 1. The possibly missed fall through edge (if it exists) from BB out of
2281 /// the loop to the loop header.
2282 /// 2. The possibly missed fall through edges (if they exist) from the loop
2283 /// exits to BB out of the loop.
2284 /// 3. The missed fall through edge (if it exists) from the last BB to the
2285 /// first BB in the loop chain.
2286 /// Therefore, the cost for a given rotation is the sum of costs listed above.
2287 /// We select the best rotation with the smallest cost.
2288 void MachineBlockPlacement::rotateLoopWithProfile(
2289  BlockChain &LoopChain, const MachineLoop &L,
2290  const BlockFilterSet &LoopBlockSet) {
2291  auto RotationPos = LoopChain.end();
2292 
2293  BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
2294 
2295  // A utility lambda that scales up a block frequency by dividing it by a
2296  // branch probability which is the reciprocal of the scale.
2297  auto ScaleBlockFrequency = [](BlockFrequency Freq,
2298  unsigned Scale) -> BlockFrequency {
2299  if (Scale == 0)
2300  return 0;
2301  // Use operator / between BlockFrequency and BranchProbability to implement
2302  // saturating multiplication.
2303  return Freq / BranchProbability(1, Scale);
2304  };
2305 
2306  // Compute the cost of the missed fall-through edge to the loop header if the
2307  // chain head is not the loop header. As we only consider natural loops with
2308  // single header, this computation can be done only once.
2309  BlockFrequency HeaderFallThroughCost(0);
2310  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2311  for (auto *Pred : ChainHeaderBB->predecessors()) {
2312  BlockChain *PredChain = BlockToChain[Pred];
2313  if (!LoopBlockSet.count(Pred) &&
2314  (!PredChain || Pred == *std::prev(PredChain->end()))) {
2315  auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2316  MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2317  auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2318  // If the predecessor has only an unconditional jump to the header, we
2319  // need to consider the cost of this jump.
2320  if (Pred->succ_size() == 1)
2321  FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2322  HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2323  }
2324  }
2325 
2326  // Here we collect all exit blocks in the loop, and for each exit we find out
2327  // its hottest exit edge. For each loop rotation, we define the loop exit cost
2328  // as the sum of frequencies of exit edges we collect here, excluding the exit
2329  // edge from the tail of the loop chain.
2331  for (auto BB : LoopChain) {
2332  auto LargestExitEdgeProb = BranchProbability::getZero();
2333  for (auto *Succ : BB->successors()) {
2334  BlockChain *SuccChain = BlockToChain[Succ];
2335  if (!LoopBlockSet.count(Succ) &&
2336  (!SuccChain || Succ == *SuccChain->begin())) {
2337  auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2338  LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2339  }
2340  }
2341  if (LargestExitEdgeProb > BranchProbability::getZero()) {
2342  auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2343  ExitsWithFreq.emplace_back(BB, ExitFreq);
2344  }
2345  }
2346 
2347  // In this loop we iterate every block in the loop chain and calculate the
2348  // cost assuming the block is the head of the loop chain. When the loop ends,
2349  // we should have found the best candidate as the loop chain's head.
2350  for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2351  EndIter = LoopChain.end();
2352  Iter != EndIter; Iter++, TailIter++) {
2353  // TailIter is used to track the tail of the loop chain if the block we are
2354  // checking (pointed by Iter) is the head of the chain.
2355  if (TailIter == LoopChain.end())
2356  TailIter = LoopChain.begin();
2357 
2358  auto TailBB = *TailIter;
2359 
2360  // Calculate the cost by putting this BB to the top.
2361  BlockFrequency Cost = 0;
2362 
2363  // If the current BB is the loop header, we need to take into account the
2364  // cost of the missed fall through edge from outside of the loop to the
2365  // header.
2366  if (Iter != LoopChain.begin())
2367  Cost += HeaderFallThroughCost;
2368 
2369  // Collect the loop exit cost by summing up frequencies of all exit edges
2370  // except the one from the chain tail.
2371  for (auto &ExitWithFreq : ExitsWithFreq)
2372  if (TailBB != ExitWithFreq.first)
2373  Cost += ExitWithFreq.second;
2374 
2375  // The cost of breaking the once fall-through edge from the tail to the top
2376  // of the loop chain. Here we need to consider three cases:
2377  // 1. If the tail node has only one successor, then we will get an
2378  // additional jmp instruction. So the cost here is (MisfetchCost +
2379  // JumpInstCost) * tail node frequency.
2380  // 2. If the tail node has two successors, then we may still get an
2381  // additional jmp instruction if the layout successor after the loop
2382  // chain is not its CFG successor. Note that the more frequently executed
2383  // jmp instruction will be put ahead of the other one. Assume the
2384  // frequency of those two branches are x and y, where x is the frequency
2385  // of the edge to the chain head, then the cost will be
2386  // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2387  // 3. If the tail node has more than two successors (this rarely happens),
2388  // we won't consider any additional cost.
2389  if (TailBB->isSuccessor(*Iter)) {
2390  auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2391  if (TailBB->succ_size() == 1)
2392  Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
2394  else if (TailBB->succ_size() == 2) {
2395  auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2396  auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2397  auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2398  ? TailBBFreq * TailToHeadProb.getCompl()
2399  : TailToHeadFreq;
2400  Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2401  ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2402  }
2403  }
2404 
2405  LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2406  << getBlockName(*Iter)
2407  << " to the top: " << Cost.getFrequency() << "\n");
2408 
2409  if (Cost < SmallestRotationCost) {
2410  SmallestRotationCost = Cost;
2411  RotationPos = Iter;
2412  }
2413  }
2414 
2415  if (RotationPos != LoopChain.end()) {
2416  LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2417  << " to the top\n");
2418  std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2419  }
2420 }
2421 
2422 /// Collect blocks in the given loop that are to be placed.
2423 ///
2424 /// When profile data is available, exclude cold blocks from the returned set;
2425 /// otherwise, collect all blocks in the loop.
2427 MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2428  BlockFilterSet LoopBlockSet;
2429 
2430  // Filter cold blocks off from LoopBlockSet when profile data is available.
2431  // Collect the sum of frequencies of incoming edges to the loop header from
2432  // outside. If we treat the loop as a super block, this is the frequency of
2433  // the loop. Then for each block in the loop, we calculate the ratio between
2434  // its frequency and the frequency of the loop block. When it is too small,
2435  // don't add it to the loop chain. If there are outer loops, then this block
2436  // will be merged into the first outer loop chain for which this block is not
2437  // cold anymore. This needs precise profile data and we only do this when
2438  // profile data is available.
2440  BlockFrequency LoopFreq(0);
2441  for (auto LoopPred : L.getHeader()->predecessors())
2442  if (!L.contains(LoopPred))
2443  LoopFreq += MBFI->getBlockFreq(LoopPred) *
2444  MBPI->getEdgeProbability(LoopPred, L.getHeader());
2445 
2446  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2447  auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2448  if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2449  continue;
2450  LoopBlockSet.insert(LoopBB);
2451  }
2452  } else
2453  LoopBlockSet.insert(L.block_begin(), L.block_end());
2454 
2455  return LoopBlockSet;
2456 }
2457 
2458 /// Forms basic block chains from the natural loop structures.
2459 ///
2460 /// These chains are designed to preserve the existing *structure* of the code
2461 /// as much as possible. We can then stitch the chains together in a way which
2462 /// both preserves the topological structure and minimizes taken conditional
2463 /// branches.
2464 void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2465  // First recurse through any nested loops, building chains for those inner
2466  // loops.
2467  for (const MachineLoop *InnerLoop : L)
2468  buildLoopChains(*InnerLoop);
2469 
2470  assert(BlockWorkList.empty() &&
2471  "BlockWorkList not empty when starting to build loop chains.");
2472  assert(EHPadWorkList.empty() &&
2473  "EHPadWorkList not empty when starting to build loop chains.");
2474  BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2475 
2476  // Check if we have profile data for this function. If yes, we will rotate
2477  // this loop by modeling costs more precisely which requires the profile data
2478  // for better layout.
2479  bool RotateLoopWithProfile =
2482 
2483  // First check to see if there is an obviously preferable top block for the
2484  // loop. This will default to the header, but may end up as one of the
2485  // predecessors to the header if there is one which will result in strictly
2486  // fewer branches in the loop body.
2487  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2488 
2489  // If we selected just the header for the loop top, look for a potentially
2490  // profitable exit block in the event that rotating the loop can eliminate
2491  // branches by placing an exit edge at the bottom.
2492  //
2493  // Loops are processed innermost to uttermost, make sure we clear
2494  // PreferredLoopExit before processing a new loop.
2495  PreferredLoopExit = nullptr;
2496  BlockFrequency ExitFreq;
2497  if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2498  PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2499 
2500  BlockChain &LoopChain = *BlockToChain[LoopTop];
2501 
2502  // FIXME: This is a really lame way of walking the chains in the loop: we
2503  // walk the blocks, and use a set to prevent visiting a particular chain
2504  // twice.
2505  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2506  assert(LoopChain.UnscheduledPredecessors == 0 &&
2507  "LoopChain should not have unscheduled predecessors.");
2508  UpdatedPreds.insert(&LoopChain);
2509 
2510  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2511  fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2512 
2513  buildChain(LoopTop, LoopChain, &LoopBlockSet);
2514 
2515  if (RotateLoopWithProfile)
2516  rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2517  else
2518  rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2519 
2520  LLVM_DEBUG({
2521  // Crash at the end so we get all of the debugging output first.
2522  bool BadLoop = false;
2523  if (LoopChain.UnscheduledPredecessors) {
2524  BadLoop = true;
2525  dbgs() << "Loop chain contains a block without its preds placed!\n"
2526  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2527  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2528  }
2529  for (MachineBasicBlock *ChainBB : LoopChain) {
2530  dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2531  if (!LoopBlockSet.remove(ChainBB)) {
2532  // We don't mark the loop as bad here because there are real situations
2533  // where this can occur. For example, with an unanalyzable fallthrough
2534  // from a loop block to a non-loop block or vice versa.
2535  dbgs() << "Loop chain contains a block not contained by the loop!\n"
2536  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2537  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2538  << " Bad block: " << getBlockName(ChainBB) << "\n";
2539  }
2540  }
2541 
2542  if (!LoopBlockSet.empty()) {
2543  BadLoop = true;
2544  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2545  dbgs() << "Loop contains blocks never placed into a chain!\n"
2546  << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2547  << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2548  << " Bad block: " << getBlockName(LoopBB) << "\n";
2549  }
2550  assert(!BadLoop && "Detected problems with the placement of this loop.");
2551  });
2552 
2553  BlockWorkList.clear();
2554  EHPadWorkList.clear();
2555 }
2556 
2557 void MachineBlockPlacement::buildCFGChains() {
2558  // Ensure that every BB in the function has an associated chain to simplify
2559  // the assumptions of the remaining algorithm.
2560  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2561  for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2562  ++FI) {
2563  MachineBasicBlock *BB = &*FI;
2564  BlockChain *Chain =
2565  new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2566  // Also, merge any blocks which we cannot reason about and must preserve
2567  // the exact fallthrough behavior for.
2568  while (true) {
2569  Cond.clear();
2570  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2571  if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2572  break;
2573 
2574  MachineFunction::iterator NextFI = std::next(FI);
2575  MachineBasicBlock *NextBB = &*NextFI;
2576  // Ensure that the layout successor is a viable block, as we know that
2577  // fallthrough is a possibility.
2578  assert(NextFI != FE && "Can't fallthrough past the last block.");
2579  LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2580  << getBlockName(BB) << " -> " << getBlockName(NextBB)
2581  << "\n");
2582  Chain->merge(NextBB, nullptr);
2583 #ifndef NDEBUG
2584  BlocksWithUnanalyzableExits.insert(&*BB);
2585 #endif
2586  FI = NextFI;
2587  BB = NextBB;
2588  }
2589  }
2590 
2591  // Build any loop-based chains.
2592  PreferredLoopExit = nullptr;
2593  for (MachineLoop *L : *MLI)
2594  buildLoopChains(*L);
2595 
2596  assert(BlockWorkList.empty() &&
2597  "BlockWorkList should be empty before building final chain.");
2598  assert(EHPadWorkList.empty() &&
2599  "EHPadWorkList should be empty before building final chain.");
2600 
2601  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2602  for (MachineBasicBlock &MBB : *F)
2603  fillWorkLists(&MBB, UpdatedPreds);
2604 
2605  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2606  buildChain(&F->front(), FunctionChain);
2607 
2608 #ifndef NDEBUG
2609  using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2610 #endif
2611  LLVM_DEBUG({
2612  // Crash at the end so we get all of the debugging output first.
2613  bool BadFunc = false;
2614  FunctionBlockSetType FunctionBlockSet;
2615  for (MachineBasicBlock &MBB : *F)
2616  FunctionBlockSet.insert(&MBB);
2617 
2618  for (MachineBasicBlock *ChainBB : FunctionChain)
2619  if (!FunctionBlockSet.erase(ChainBB)) {
2620  BadFunc = true;
2621  dbgs() << "Function chain contains a block not in the function!\n"
2622  << " Bad block: " << getBlockName(ChainBB) << "\n";
2623  }
2624 
2625  if (!FunctionBlockSet.empty()) {
2626  BadFunc = true;
2627  for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2628  dbgs() << "Function contains blocks never placed into a chain!\n"
2629  << " Bad block: " << getBlockName(RemainingBB) << "\n";
2630  }
2631  assert(!BadFunc && "Detected problems with the block placement.");
2632  });
2633 
2634  // Splice the blocks into place.
2635  MachineFunction::iterator InsertPos = F->begin();
2636  LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2637  for (MachineBasicBlock *ChainBB : FunctionChain) {
2638  LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2639  : " ... ")
2640  << getBlockName(ChainBB) << "\n");
2641  if (InsertPos != MachineFunction::iterator(ChainBB))
2642  F->splice(InsertPos, ChainBB);
2643  else
2644  ++InsertPos;
2645 
2646  // Update the terminator of the previous block.
2647  if (ChainBB == *FunctionChain.begin())
2648  continue;
2649  MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2650 
2651  // FIXME: It would be awesome of updateTerminator would just return rather
2652  // than assert when the branch cannot be analyzed in order to remove this
2653  // boiler plate.
2654  Cond.clear();
2655  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2656 
2657 #ifndef NDEBUG
2658  if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2659  // Given the exact block placement we chose, we may actually not _need_ to
2660  // be able to edit PrevBB's terminator sequence, but not being _able_ to
2661  // do that at this point is a bug.
2662  assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2663  !PrevBB->canFallThrough()) &&
2664  "Unexpected block with un-analyzable fallthrough!");
2665  Cond.clear();
2666  TBB = FBB = nullptr;
2667  }
2668 #endif
2669 
2670  // The "PrevBB" is not yet updated to reflect current code layout, so,
2671  // o. it may fall-through to a block without explicit "goto" instruction
2672  // before layout, and no longer fall-through it after layout; or
2673  // o. just opposite.
2674  //
2675  // analyzeBranch() may return erroneous value for FBB when these two
2676  // situations take place. For the first scenario FBB is mistakenly set NULL;
2677  // for the 2nd scenario, the FBB, which is expected to be NULL, is
2678  // mistakenly pointing to "*BI".
2679  // Thus, if the future change needs to use FBB before the layout is set, it
2680  // has to correct FBB first by using the code similar to the following:
2681  //
2682  // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2683  // PrevBB->updateTerminator();
2684  // Cond.clear();
2685  // TBB = FBB = nullptr;
2686  // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2687  // // FIXME: This should never take place.
2688  // TBB = FBB = nullptr;
2689  // }
2690  // }
2691  if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
2692  PrevBB->updateTerminator();
2693  }
2694 
2695  // Fixup the last block.
2696  Cond.clear();
2697  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2698  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
2699  F->back().updateTerminator();
2700 
2701  BlockWorkList.clear();
2702  EHPadWorkList.clear();
2703 }
2704 
2705 void MachineBlockPlacement::optimizeBranches() {
2706  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2707  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
2708 
2709  // Now that all the basic blocks in the chain have the proper layout,
2710  // make a final call to AnalyzeBranch with AllowModify set.
2711  // Indeed, the target may be able to optimize the branches in a way we
2712  // cannot because all branches may not be analyzable.
2713  // E.g., the target may be able to remove an unconditional branch to
2714  // a fallthrough when it occurs after predicated terminators.
2715  for (MachineBasicBlock *ChainBB : FunctionChain) {
2716  Cond.clear();
2717  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
2718  if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2719  // If PrevBB has a two-way branch, try to re-order the branches
2720  // such that we branch to the successor with higher probability first.
2721  if (TBB && !Cond.empty() && FBB &&
2722  MBPI->getEdgeProbability(ChainBB, FBB) >
2723  MBPI->getEdgeProbability(ChainBB, TBB) &&
2724  !TII->reverseBranchCondition(Cond)) {
2725  LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2726  << getBlockName(ChainBB) << "\n");
2727  LLVM_DEBUG(dbgs() << " Edge probability: "
2728  << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2729  << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2730  DebugLoc dl; // FIXME: this is nowhere
2731  TII->removeBranch(*ChainBB);
2732  TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2733  ChainBB->updateTerminator();
2734  }
2735  }
2736  }
2737 }
2738 
2739 void MachineBlockPlacement::alignBlocks() {
2740  // Walk through the backedges of the function now that we have fully laid out
2741  // the basic blocks and align the destination of each backedge. We don't rely
2742  // exclusively on the loop info here so that we can align backedges in
2743  // unnatural CFGs and backedges that were introduced purely because of the
2744  // loop rotations done during this layout pass.
2745  if (F->getFunction().hasMinSize() ||
2746  (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2747  return;
2748  BlockChain &FunctionChain = *BlockToChain[&F->front()];
2749  if (FunctionChain.begin() == FunctionChain.end())
2750  return; // Empty chain.
2751 
2752  const BranchProbability ColdProb(1, 5); // 20%
2753  BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2754  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2755  for (MachineBasicBlock *ChainBB : FunctionChain) {
2756  if (ChainBB == *FunctionChain.begin())
2757  continue;
2758 
2759  // Don't align non-looping basic blocks. These are unlikely to execute
2760  // enough times to matter in practice. Note that we'll still handle
2761  // unnatural CFGs inside of a natural outer loop (the common case) and
2762  // rotated loops.
2763  MachineLoop *L = MLI->getLoopFor(ChainBB);
2764  if (!L)
2765  continue;
2766 
2767  const Align Align = TLI->getPrefLoopAlignment(L);
2768  if (Align == 1)
2769  continue; // Don't care about loop alignment.
2770 
2771  // If the block is cold relative to the function entry don't waste space
2772  // aligning it.
2773  BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2774  if (Freq < WeightedEntryFreq)
2775  continue;
2776 
2777  // If the block is cold relative to its loop header, don't align it
2778  // regardless of what edges into the block exist.
2779  MachineBasicBlock *LoopHeader = L->getHeader();
2780  BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2781  if (Freq < (LoopHeaderFreq * ColdProb))
2782  continue;
2783 
2784  // Check for the existence of a non-layout predecessor which would benefit
2785  // from aligning this block.
2786  MachineBasicBlock *LayoutPred =
2787  &*std::prev(MachineFunction::iterator(ChainBB));
2788 
2789  // Force alignment if all the predecessors are jumps. We already checked
2790  // that the block isn't cold above.
2791  if (!LayoutPred->isSuccessor(ChainBB)) {
2792  ChainBB->setAlignment(Align);
2793  continue;
2794  }
2795 
2796  // Align this block if the layout predecessor's edge into this block is
2797  // cold relative to the block. When this is true, other predecessors make up
2798  // all of the hot entries into the block and thus alignment is likely to be
2799  // important.
2800  BranchProbability LayoutProb =
2801  MBPI->getEdgeProbability(LayoutPred, ChainBB);
2802  BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
2803  if (LayoutEdgeFreq <= (Freq * ColdProb))
2804  ChainBB->setAlignment(Align);
2805  }
2806 }
2807 
2808 /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
2809 /// it was duplicated into its chain predecessor and removed.
2810 /// \p BB - Basic block that may be duplicated.
2811 ///
2812 /// \p LPred - Chosen layout predecessor of \p BB.
2813 /// Updated to be the chain end if LPred is removed.
2814 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2815 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2816 /// Used to identify which blocks to update predecessor
2817 /// counts.
2818 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2819 /// chosen in the given order due to unnatural CFG
2820 /// only needed if \p BB is removed and
2821 /// \p PrevUnplacedBlockIt pointed to \p BB.
2822 /// @return true if \p BB was removed.
2823 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
2824  MachineBasicBlock *BB, MachineBasicBlock *&LPred,
2825  const MachineBasicBlock *LoopHeaderBB,
2826  BlockChain &Chain, BlockFilterSet *BlockFilter,
2827  MachineFunction::iterator &PrevUnplacedBlockIt) {
2828  bool Removed, DuplicatedToLPred;
2829  bool DuplicatedToOriginalLPred;
2830  Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
2831  PrevUnplacedBlockIt,
2832  DuplicatedToLPred);
2833  if (!Removed)
2834  return false;
2835  DuplicatedToOriginalLPred = DuplicatedToLPred;
2836  // Iteratively try to duplicate again. It can happen that a block that is
2837  // duplicated into is still small enough to be duplicated again.
2838  // No need to call markBlockSuccessors in this case, as the blocks being
2839  // duplicated from here on are already scheduled.
2840  // Note that DuplicatedToLPred always implies Removed.
2841  while (DuplicatedToLPred) {
2842  assert(Removed && "Block must have been removed to be duplicated into its "
2843  "layout predecessor.");
2844  MachineBasicBlock *DupBB, *DupPred;
2845  // The removal callback causes Chain.end() to be updated when a block is
2846  // removed. On the first pass through the loop, the chain end should be the
2847  // same as it was on function entry. On subsequent passes, because we are
2848  // duplicating the block at the end of the chain, if it is removed the
2849  // chain will have shrunk by one block.
2850  BlockChain::iterator ChainEnd = Chain.end();
2851  DupBB = *(--ChainEnd);
2852  // Now try to duplicate again.
2853  if (ChainEnd == Chain.begin())
2854  break;
2855  DupPred = *std::prev(ChainEnd);
2856  Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
2857  PrevUnplacedBlockIt,
2858  DuplicatedToLPred);
2859  }
2860  // If BB was duplicated into LPred, it is now scheduled. But because it was
2861  // removed, markChainSuccessors won't be called for its chain. Instead we
2862  // call markBlockSuccessors for LPred to achieve the same effect. This must go
2863  // at the end because repeating the tail duplication can increase the number
2864  // of unscheduled predecessors.
2865  LPred = *std::prev(Chain.end());
2866  if (DuplicatedToOriginalLPred)
2867  markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
2868  return true;
2869 }
2870 
2871 /// Tail duplicate \p BB into (some) predecessors if profitable.
2872 /// \p BB - Basic block that may be duplicated
2873 /// \p LPred - Chosen layout predecessor of \p BB
2874 /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
2875 /// \p BlockFilter - Set of blocks that belong to the loop being laid out.
2876 /// Used to identify which blocks to update predecessor
2877 /// counts.
2878 /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
2879 /// chosen in the given order due to unnatural CFG
2880 /// only needed if \p BB is removed and
2881 /// \p PrevUnplacedBlockIt pointed to \p BB.
2882 /// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
2883 /// only be true if the block was removed.
2884 /// \return - True if the block was duplicated into all preds and removed.
2885 bool MachineBlockPlacement::maybeTailDuplicateBlock(
2887  BlockChain &Chain, BlockFilterSet *BlockFilter,
2888  MachineFunction::iterator &PrevUnplacedBlockIt,
2889  bool &DuplicatedToLPred) {
2890  DuplicatedToLPred = false;
2891  if (!shouldTailDuplicate(BB))
2892  return false;
2893 
2894  LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
2895  << "\n");
2896 
2897  // This has to be a callback because none of it can be done after
2898  // BB is deleted.
2899  bool Removed = false;
2900  auto RemovalCallback =
2901  [&](MachineBasicBlock *RemBB) {
2902  // Signal to outer function
2903  Removed = true;
2904 
2905  // Conservative default.
2906  bool InWorkList = true;
2907  // Remove from the Chain and Chain Map
2908  if (BlockToChain.count(RemBB)) {
2909  BlockChain *Chain = BlockToChain[RemBB];
2910  InWorkList = Chain->UnscheduledPredecessors == 0;
2911  Chain->remove(RemBB);
2912  BlockToChain.erase(RemBB);
2913  }
2914 
2915  // Handle the unplaced block iterator
2916  if (&(*PrevUnplacedBlockIt) == RemBB) {
2917  PrevUnplacedBlockIt++;
2918  }
2919 
2920  // Handle the Work Lists
2921  if (InWorkList) {
2922  SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
2923  if (RemBB->isEHPad())
2924  RemoveList = EHPadWorkList;
2925  RemoveList.erase(
2926  llvm::remove_if(RemoveList,
2927  [RemBB](MachineBasicBlock *BB) {
2928  return BB == RemBB;
2929  }),
2930  RemoveList.end());
2931  }
2932 
2933  // Handle the filter set
2934  if (BlockFilter) {
2935  BlockFilter->remove(RemBB);
2936  }
2937 
2938  // Remove the block from loop info.
2939  MLI->removeBlock(RemBB);
2940  if (RemBB == PreferredLoopExit)
2941  PreferredLoopExit = nullptr;
2942 
2943  LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
2944  << getBlockName(RemBB) << "\n");
2945  };
2946  auto RemovalCallbackRef =
2947  function_ref<void(MachineBasicBlock*)>(RemovalCallback);
2948 
2949  SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
2950  bool IsSimple = TailDup.isSimpleBB(BB);
2951  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
2952  &DuplicatedPreds, &RemovalCallbackRef);
2953 
2954  // Update UnscheduledPredecessors to reflect tail-duplication.
2955  DuplicatedToLPred = false;
2956  for (MachineBasicBlock *Pred : DuplicatedPreds) {
2957  // We're only looking for unscheduled predecessors that match the filter.
2958  BlockChain* PredChain = BlockToChain[Pred];
2959  if (Pred == LPred)
2960  DuplicatedToLPred = true;
2961  if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
2962  || PredChain == &Chain)
2963  continue;
2964  for (MachineBasicBlock *NewSucc : Pred->successors()) {
2965  if (BlockFilter && !BlockFilter->count(NewSucc))
2966  continue;
2967  BlockChain *NewChain = BlockToChain[NewSucc];
2968  if (NewChain != &Chain && NewChain != PredChain)
2969  NewChain->UnscheduledPredecessors++;
2970  }
2971  }
2972  return Removed;
2973 }
2974 
2975 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
2976  if (skipFunction(MF.getFunction()))
2977  return false;
2978 
2979  // Check for single-block functions and skip them.
2980  if (std::next(MF.begin()) == MF.end())
2981  return false;
2982 
2983  F = &MF;
2984  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
2985  MBFI = std::make_unique<BranchFolder::MBFIWrapper>(
2986  getAnalysis<MachineBlockFrequencyInfo>());
2987  MLI = &getAnalysis<MachineLoopInfo>();
2988  TII = MF.getSubtarget().getInstrInfo();
2989  TLI = MF.getSubtarget().getTargetLowering();
2990  MPDT = nullptr;
2991 
2992  // Initialize PreferredLoopExit to nullptr here since it may never be set if
2993  // there are no MachineLoops.
2994  PreferredLoopExit = nullptr;
2995 
2996  assert(BlockToChain.empty() &&
2997  "BlockToChain map should be empty before starting placement.");
2998  assert(ComputedEdges.empty() &&
2999  "Computed Edge map should be empty before starting placement.");
3000 
3001  unsigned TailDupSize = TailDupPlacementThreshold;
3002  // If only the aggressive threshold is explicitly set, use it.
3003  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3004  TailDupPlacementThreshold.getNumOccurrences() == 0)
3006 
3007  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3008  // For aggressive optimization, we can adjust some thresholds to be less
3009  // conservative.
3010  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
3011  // At O3 we should be more willing to copy blocks for tail duplication. This
3012  // increases size pressure, so we only do it at O3
3013  // Do this unless only the regular threshold is explicitly set.
3014  if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3015  TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3017  }
3018 
3019  if (allowTailDupPlacement()) {
3020  MPDT = &getAnalysis<MachinePostDominatorTree>();
3021  if (MF.getFunction().hasOptSize())
3022  TailDupSize = 1;
3023  bool PreRegAlloc = false;
3024  TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
3025  precomputeTriangleChains();
3026  }
3027 
3028  buildCFGChains();
3029 
3030  // Changing the layout can create new tail merging opportunities.
3031  // TailMerge can create jump into if branches that make CFG irreducible for
3032  // HW that requires structured CFG.
3033  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3034  PassConfig->getEnableTailMerge() &&
3036  // No tail merging opportunities if the block number is less than four.
3037  if (MF.size() > 3 && EnableTailMerge) {
3038  unsigned TailMergeSize = TailDupSize + 1;
3039  BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
3040  *MBPI, TailMergeSize);
3041 
3042  auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
3043  if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
3044  MMIWP ? &MMIWP->getMMI() : nullptr, MLI,
3045  /*AfterPlacement=*/true)) {
3046  // Redo the layout if tail merging creates/removes/moves blocks.
3047  BlockToChain.clear();
3048  ComputedEdges.clear();
3049  // Must redo the post-dominator tree if blocks were changed.
3050  if (MPDT)
3051  MPDT->runOnMachineFunction(MF);
3052  ChainAllocator.DestroyAll();
3053  buildCFGChains();
3054  }
3055  }
3056 
3057  optimizeBranches();
3058  alignBlocks();
3059 
3060  BlockToChain.clear();
3061  ComputedEdges.clear();
3062  ChainAllocator.DestroyAll();
3063 
3064  if (AlignAllBlock)
3065  // Align all of the blocks in the function to a specific alignment.
3066  for (MachineBasicBlock &MBB : MF)
3067  MBB.setAlignment(Align(1ULL << AlignAllBlock));
3068  else if (AlignAllNonFallThruBlocks) {
3069  // Align all of the blocks that have no fall-through predecessors to a
3070  // specific alignment.
3071  for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3072  auto LayoutPred = std::prev(MBI);
3073  if (!LayoutPred->isSuccessor(&*MBI))
3074  MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3075  }
3076  }
3077  if (ViewBlockLayoutWithBFI != GVDT_None &&
3078  (ViewBlockFreqFuncName.empty() ||
3079  F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
3080  MBFI->view("MBP." + MF.getName(), false);
3081  }
3082 
3083 
3084  // We always return true as we have no way to track whether the final order
3085  // differs from the original order.
3086  return true;
3087 }
3088 
3089 namespace {
3090 
3091 /// A pass to compute block placement statistics.
3092 ///
3093 /// A separate pass to compute interesting statistics for evaluating block
3094 /// placement. This is separate from the actual placement pass so that they can
3095 /// be computed in the absence of any placement transformations or when using
3096 /// alternative placement strategies.
3097 class MachineBlockPlacementStats : public MachineFunctionPass {
3098  /// A handle to the branch probability pass.
3099  const MachineBranchProbabilityInfo *MBPI;
3100 
3101  /// A handle to the function-wide block frequency pass.
3102  const MachineBlockFrequencyInfo *MBFI;
3103 
3104 public:
3105  static char ID; // Pass identification, replacement for typeid
3106 
3107  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3109  }
3110 
3111  bool runOnMachineFunction(MachineFunction &F) override;
3112 
3113  void getAnalysisUsage(AnalysisUsage &AU) const override {
3116  AU.setPreservesAll();
3118  }
3119 };
3120 
3121 } // end anonymous namespace
3122 
3124 
3126 
3127 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3128  "Basic Block Placement Stats", false, false)
3131 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3132  "Basic Block Placement Stats", false, false)
3133 
3134 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3135  // Check for single-block functions and skip them.
3136  if (std::next(F.begin()) == F.end())
3137  return false;
3138 
3139  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
3140  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3141 
3142  for (MachineBasicBlock &MBB : F) {
3143  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3144  Statistic &NumBranches =
3145  (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3146  Statistic &BranchTakenFreq =
3147  (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3148  for (MachineBasicBlock *Succ : MBB.successors()) {
3149  // Skip if this successor is a fallthrough.
3150  if (MBB.isLayoutSuccessor(Succ))
3151  continue;
3152 
3153  BlockFrequency EdgeFreq =
3154  BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3155  ++NumBranches;
3156  BranchTakenFreq += EdgeFreq.getFrequency();
3157  }
3158  }
3159 
3160  return false;
3161 }
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
BranchProbability getCompl() const
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:320
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:484
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:622
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
unsigned size() const
bool requiresStructuredCFG() const
virtual const TargetLowering * getTargetLowering() const
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const
Insert branch code into the end of the specified MachineBasicBlock.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:104
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
F(f)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static BranchProbability getOne()
virtual unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const
Remove the branching code at the end of the specific MBB.
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
CodeGenOpt::Level getOptLevel() const
This file defines the MallocAllocator and BumpPtrAllocator interfaces.
iterator_range< succ_iterator > successors()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:195
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:455
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, MachineModuleInfo *mmi, MachineLoopInfo *mli=nullptr, bool AfterPlacement=false)
Perhaps branch folding, tail merging and other CFG optimizations on the given function.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void removeBlock(MachineBasicBlock *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
block placement Basic Block Placement Stats
Target-Independent Code Generator Pass Configuration Options.
BlockT * getHeader() const
Definition: LoopInfo.h:105
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:261
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:467
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
virtual const TargetInstrInfo * getInstrInfo() const
cl::opt< std::string > ViewBlockFreqFuncName
auto count(R &&Range, const E &Element) -> typename std::iterator_traits< decltype(adl_begin(Range))>::difference_type
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1231
TargetInstrInfo - Interface to description of machine instruction set.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
cl::opt< unsigned > ProfileLikelyProb
bool getEnableTailMerge() const
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
#define P(N)
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock *> *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
cl::opt< GVDAGType > ViewBlockLayoutWithBFI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:290
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
succ_reverse_iterator succ_rbegin()
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
Represent the analysis usage information of a pass.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, uint64_t EntryFreq)
Compare 2 BlockFrequency&#39;s with a small penalty for A.
iterator_range< pred_iterator > predecessors()
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:308
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:496
auto remove_if(R &&Range, UnaryPredicate P) -> decltype(adl_begin(Range))
Provide wrappers to std::remove_if which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1205
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e...
iterator erase(const_iterator CI)
Definition: SmallVector.h:434
const MachineBasicBlock & front() const
size_t size() const
Definition: SmallVector.h:52
void initializeMachineBlockPlacementPass(PassRegistry &)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:40
Branch Probability Basic Block Placement
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
size_type size() const
Definition: SmallPtrSet.h:92
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:115
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Iterator for intrusive lists based on ilist_node.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void setAlignment(Align A)
Set alignment of the basic block.
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:389
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
loop rotate
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
void updateTerminator()
Update the terminator instructions in block to account for changes to the layout. ...
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_END(MachineBlockPlacement
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:446
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
void setPreservesAll()
Set by analyses that do not transform their input at all.
typename SuperClass::iterator iterator
Definition: SmallVector.h:319
unsigned succ_size() const
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
cl::opt< unsigned > StaticLikelyProb
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:174
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:168
unsigned succ_size(const Instruction *I)
Definition: CFG.h:256
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static uint64_t getMaxFrequency()
Returns the maximum possible frequency, the saturation value.
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:154
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock *> &Successors)
Check if BB has exactly the successors in Successors.
#define I(x, y, z)
Definition: MD5.cpp:58
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:619
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
block_iterator block_end() const
Definition: LoopInfo.h:160
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:185
virtual bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const
Reverses the branch condition of the specified condition list, returning false on success and true if...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void stable_sort(R &&Range)
Definition: STLExtras.h:1289
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:503
#define DEBUG_TYPE
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
static BranchProbability getZero()
Utility class to perform tail duplication.
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
block_iterator block_begin() const
Definition: LoopInfo.h:159
uint32_t getNumerator() const
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
This file describes how to lower LLVM code to machine code.