LLVM 19.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
27#include "BranchFolding.h"
28#include "llvm/ADT/ArrayRef.h"
29#include "llvm/ADT/DenseMap.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/SetVector.h"
34#include "llvm/ADT/Statistic.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/IR/Function.h"
53#include "llvm/IR/PrintPasses.h"
55#include "llvm/Pass.h"
62#include "llvm/Support/Debug.h"
66#include <algorithm>
67#include <cassert>
68#include <cstdint>
69#include <iterator>
70#include <memory>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "block-placement"
79
80STATISTIC(NumCondBranches, "Number of conditional branches");
81STATISTIC(NumUncondBranches, "Number of unconditional branches");
82STATISTIC(CondBranchTakenFreq,
83 "Potential frequency of taking conditional branches");
84STATISTIC(UncondBranchTakenFreq,
85 "Potential frequency of taking unconditional branches");
86
88 "align-all-blocks",
89 cl::desc("Force the alignment of all blocks in the function in log2 format "
90 "(e.g 4 means align on 16B boundaries)."),
92
94 "align-all-nofallthru-blocks",
95 cl::desc("Force the alignment of all blocks that have no fall-through "
96 "predecessors (i.e. don't add nops that are executed). In log2 "
97 "format (e.g 4 means align on 16B boundaries)."),
99
101 "max-bytes-for-alignment",
102 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
103 "alignment"),
104 cl::init(0), cl::Hidden);
105
106// FIXME: Find a good default for this flag and remove the flag.
108 "block-placement-exit-block-bias",
109 cl::desc("Block frequency percentage a loop exit block needs "
110 "over the original exit to be considered the new exit."),
111 cl::init(0), cl::Hidden);
112
113// Definition:
114// - Outlining: placement of a basic block outside the chain or hot path.
115
117 "loop-to-cold-block-ratio",
118 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
119 "(frequency of block) is greater than this ratio"),
120 cl::init(5), cl::Hidden);
121
123 "force-loop-cold-block",
124 cl::desc("Force outlining cold blocks from loops."),
125 cl::init(false), cl::Hidden);
126
127static cl::opt<bool>
128 PreciseRotationCost("precise-rotation-cost",
129 cl::desc("Model the cost of loop rotation more "
130 "precisely by using profile data."),
131 cl::init(false), cl::Hidden);
132
133static cl::opt<bool>
134 ForcePreciseRotationCost("force-precise-rotation-cost",
135 cl::desc("Force the use of precise cost "
136 "loop rotation strategy."),
137 cl::init(false), cl::Hidden);
138
140 "misfetch-cost",
141 cl::desc("Cost that models the probabilistic risk of an instruction "
142 "misfetch due to a jump comparing to falling through, whose cost "
143 "is zero."),
144 cl::init(1), cl::Hidden);
145
146static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
147 cl::desc("Cost of jump instructions."),
148 cl::init(1), cl::Hidden);
149static cl::opt<bool>
150TailDupPlacement("tail-dup-placement",
151 cl::desc("Perform tail duplication during placement. "
152 "Creates more fallthrough opportunites in "
153 "outline branches."),
154 cl::init(true), cl::Hidden);
155
156static cl::opt<bool>
157BranchFoldPlacement("branch-fold-placement",
158 cl::desc("Perform branch folding during placement. "
159 "Reduces code size."),
160 cl::init(true), cl::Hidden);
161
162// Heuristic for tail duplication.
164 "tail-dup-placement-threshold",
165 cl::desc("Instruction cutoff for tail duplication during layout. "
166 "Tail merging during layout is forced to have a threshold "
167 "that won't conflict."), cl::init(2),
168 cl::Hidden);
169
170// Heuristic for aggressive tail duplication.
172 "tail-dup-placement-aggressive-threshold",
173 cl::desc("Instruction cutoff for aggressive tail duplication during "
174 "layout. Used at -O3. Tail merging during layout is forced to "
175 "have a threshold that won't conflict."), cl::init(4),
176 cl::Hidden);
177
178// Heuristic for tail duplication.
180 "tail-dup-placement-penalty",
181 cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
182 "Copying can increase fallthrough, but it also increases icache "
183 "pressure. This parameter controls the penalty to account for that. "
184 "Percent as integer."),
185 cl::init(2),
186 cl::Hidden);
187
188// Heuristic for tail duplication if profile count is used in cost model.
190 "tail-dup-profile-percent-threshold",
191 cl::desc("If profile count information is used in tail duplication cost "
192 "model, the gained fall through number from tail duplication "
193 "should be at least this percent of hot count."),
194 cl::init(50), cl::Hidden);
195
196// Heuristic for triangle chains.
198 "triangle-chain-count",
199 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
200 "triangle tail duplication heuristic to kick in. 0 to disable."),
201 cl::init(2),
202 cl::Hidden);
203
204// Use case: When block layout is visualized after MBP pass, the basic blocks
205// are labeled in layout order; meanwhile blocks could be numbered in a
206// different order. It's hard to map between the graph and pass output.
207// With this option on, the basic blocks are renumbered in function layout
208// order. For debugging only.
210 "renumber-blocks-before-view",
211 cl::desc(
212 "If true, basic blocks are re-numbered before MBP layout is printed "
213 "into a dot graph. Only used when a function is being printed."),
214 cl::init(false), cl::Hidden);
215
216namespace llvm {
221
222// Internal option used to control BFI display only after MBP pass.
223// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
224// -view-block-layout-with-bfi=
226
227// Command line option to specify the name of the function for CFG dump
228// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
230} // namespace llvm
231
232namespace {
233
234class BlockChain;
235
236/// Type for our function-wide basic block -> block chain mapping.
237using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
238
239/// A chain of blocks which will be laid out contiguously.
240///
241/// This is the datastructure representing a chain of consecutive blocks that
242/// are profitable to layout together in order to maximize fallthrough
243/// probabilities and code locality. We also can use a block chain to represent
244/// a sequence of basic blocks which have some external (correctness)
245/// requirement for sequential layout.
246///
247/// Chains can be built around a single basic block and can be merged to grow
248/// them. They participate in a block-to-chain mapping, which is updated
249/// automatically as chains are merged together.
250class BlockChain {
251 /// The sequence of blocks belonging to this chain.
252 ///
253 /// This is the sequence of blocks for a particular chain. These will be laid
254 /// out in-order within the function.
256
257 /// A handle to the function-wide basic block to block chain mapping.
258 ///
259 /// This is retained in each block chain to simplify the computation of child
260 /// block chains for SCC-formation and iteration. We store the edges to child
261 /// basic blocks, and map them back to their associated chains using this
262 /// structure.
263 BlockToChainMapType &BlockToChain;
264
265public:
266 /// Construct a new BlockChain.
267 ///
268 /// This builds a new block chain representing a single basic block in the
269 /// function. It also registers itself as the chain that block participates
270 /// in with the BlockToChain mapping.
271 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
272 : Blocks(1, BB), BlockToChain(BlockToChain) {
273 assert(BB && "Cannot create a chain with a null basic block");
274 BlockToChain[BB] = this;
275 }
276
277 /// Iterator over blocks within the chain.
280
281 /// Beginning of blocks within the chain.
282 iterator begin() { return Blocks.begin(); }
283 const_iterator begin() const { return Blocks.begin(); }
284
285 /// End of blocks within the chain.
286 iterator end() { return Blocks.end(); }
287 const_iterator end() const { return Blocks.end(); }
288
289 bool remove(MachineBasicBlock* BB) {
290 for(iterator i = begin(); i != end(); ++i) {
291 if (*i == BB) {
292 Blocks.erase(i);
293 return true;
294 }
295 }
296 return false;
297 }
298
299 /// Merge a block chain into this one.
300 ///
301 /// This routine merges a block chain into this one. It takes care of forming
302 /// a contiguous sequence of basic blocks, updating the edge list, and
303 /// updating the block -> chain mapping. It does not free or tear down the
304 /// old chain, but the old chain's block list is no longer valid.
305 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
306 assert(BB && "Can't merge a null block.");
307 assert(!Blocks.empty() && "Can't merge into an empty chain.");
308
309 // Fast path in case we don't have a chain already.
310 if (!Chain) {
311 assert(!BlockToChain[BB] &&
312 "Passed chain is null, but BB has entry in BlockToChain.");
313 Blocks.push_back(BB);
314 BlockToChain[BB] = this;
315 return;
316 }
317
318 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
319 assert(Chain->begin() != Chain->end());
320
321 // Update the incoming blocks to point to this chain, and add them to the
322 // chain structure.
323 for (MachineBasicBlock *ChainBB : *Chain) {
324 Blocks.push_back(ChainBB);
325 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
326 BlockToChain[ChainBB] = this;
327 }
328 }
329
330#ifndef NDEBUG
331 /// Dump the blocks in this chain.
332 LLVM_DUMP_METHOD void dump() {
333 for (MachineBasicBlock *MBB : *this)
334 MBB->dump();
335 }
336#endif // NDEBUG
337
338 /// Count of predecessors of any block within the chain which have not
339 /// yet been scheduled. In general, we will delay scheduling this chain
340 /// until those predecessors are scheduled (or we find a sufficiently good
341 /// reason to override this heuristic.) Note that when forming loop chains,
342 /// blocks outside the loop are ignored and treated as if they were already
343 /// scheduled.
344 ///
345 /// Note: This field is reinitialized multiple times - once for each loop,
346 /// and then once for the function as a whole.
347 unsigned UnscheduledPredecessors = 0;
348};
349
350class MachineBlockPlacement : public MachineFunctionPass {
351 /// A type for a block filter set.
353
354 /// Pair struct containing basic block and taildup profitability
355 struct BlockAndTailDupResult {
356 MachineBasicBlock *BB = nullptr;
357 bool ShouldTailDup;
358 };
359
360 /// Triple struct containing edge weight and the edge.
361 struct WeightedEdge {
362 BlockFrequency Weight;
363 MachineBasicBlock *Src = nullptr;
364 MachineBasicBlock *Dest = nullptr;
365 };
366
367 /// work lists of blocks that are ready to be laid out
370
371 /// Edges that have already been computed as optimal.
373
374 /// Machine Function
375 MachineFunction *F = nullptr;
376
377 /// A handle to the branch probability pass.
378 const MachineBranchProbabilityInfo *MBPI = nullptr;
379
380 /// A handle to the function-wide block frequency pass.
381 std::unique_ptr<MBFIWrapper> MBFI;
382
383 /// A handle to the loop info.
384 MachineLoopInfo *MLI = nullptr;
385
386 /// Preferred loop exit.
387 /// Member variable for convenience. It may be removed by duplication deep
388 /// in the call stack.
389 MachineBasicBlock *PreferredLoopExit = nullptr;
390
391 /// A handle to the target's instruction info.
392 const TargetInstrInfo *TII = nullptr;
393
394 /// A handle to the target's lowering info.
395 const TargetLoweringBase *TLI = nullptr;
396
397 /// A handle to the post dominator tree.
398 MachinePostDominatorTree *MPDT = nullptr;
399
400 ProfileSummaryInfo *PSI = nullptr;
401
402 /// Duplicator used to duplicate tails during placement.
403 ///
404 /// Placement decisions can open up new tail duplication opportunities, but
405 /// since tail duplication affects placement decisions of later blocks, it
406 /// must be done inline.
407 TailDuplicator TailDup;
408
409 /// Partial tail duplication threshold.
410 BlockFrequency DupThreshold;
411
412 /// True: use block profile count to compute tail duplication cost.
413 /// False: use block frequency to compute tail duplication cost.
414 bool UseProfileCount = false;
415
416 /// Allocator and owner of BlockChain structures.
417 ///
418 /// We build BlockChains lazily while processing the loop structure of
419 /// a function. To reduce malloc traffic, we allocate them using this
420 /// slab-like allocator, and destroy them after the pass completes. An
421 /// important guarantee is that this allocator produces stable pointers to
422 /// the chains.
424
425 /// Function wide BasicBlock to BlockChain mapping.
426 ///
427 /// This mapping allows efficiently moving from any given basic block to the
428 /// BlockChain it participates in, if any. We use it to, among other things,
429 /// allow implicitly defining edges between chains as the existing edges
430 /// between basic blocks.
432
433#ifndef NDEBUG
434 /// The set of basic blocks that have terminators that cannot be fully
435 /// analyzed. These basic blocks cannot be re-ordered safely by
436 /// MachineBlockPlacement, and we must preserve physical layout of these
437 /// blocks and their successors through the pass.
438 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
439#endif
440
441 /// Get block profile count or frequency according to UseProfileCount.
442 /// The return value is used to model tail duplication cost.
443 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
444 if (UseProfileCount) {
445 auto Count = MBFI->getBlockProfileCount(BB);
446 if (Count)
447 return BlockFrequency(*Count);
448 else
449 return BlockFrequency(0);
450 } else
451 return MBFI->getBlockFreq(BB);
452 }
453
454 /// Scale the DupThreshold according to basic block size.
455 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
456 void initDupThreshold();
457
458 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
459 /// if the count goes to 0, add them to the appropriate work list.
460 void markChainSuccessors(
461 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
462 const BlockFilterSet *BlockFilter = nullptr);
463
464 /// Decrease the UnscheduledPredecessors count for a single block, and
465 /// if the count goes to 0, add them to the appropriate work list.
466 void markBlockSuccessors(
467 const BlockChain &Chain, const MachineBasicBlock *BB,
468 const MachineBasicBlock *LoopHeaderBB,
469 const BlockFilterSet *BlockFilter = nullptr);
470
472 collectViableSuccessors(
473 const MachineBasicBlock *BB, const BlockChain &Chain,
474 const BlockFilterSet *BlockFilter,
476 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
477 BlockFilterSet *BlockFilter);
478 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
480 BlockFilterSet *BlockFilter);
481 bool repeatedlyTailDuplicateBlock(
483 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
484 BlockFilterSet *BlockFilter,
485 MachineFunction::iterator &PrevUnplacedBlockIt,
486 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
487 bool
488 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
489 BlockChain &Chain, BlockFilterSet *BlockFilter,
490 MachineFunction::iterator &PrevUnplacedBlockIt,
491 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
492 bool &DuplicatedToLPred);
493 bool hasBetterLayoutPredecessor(
494 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
495 const BlockChain &SuccChain, BranchProbability SuccProb,
496 BranchProbability RealSuccProb, const BlockChain &Chain,
497 const BlockFilterSet *BlockFilter);
498 BlockAndTailDupResult selectBestSuccessor(
499 const MachineBasicBlock *BB, const BlockChain &Chain,
500 const BlockFilterSet *BlockFilter);
501 MachineBasicBlock *selectBestCandidateBlock(
502 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
504 getFirstUnplacedBlock(const BlockChain &PlacedChain,
505 MachineFunction::iterator &PrevUnplacedBlockIt);
507 getFirstUnplacedBlock(const BlockChain &PlacedChain,
508 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
509 const BlockFilterSet *BlockFilter);
510
511 /// Add a basic block to the work list if it is appropriate.
512 ///
513 /// If the optional parameter BlockFilter is provided, only MBB
514 /// present in the set will be added to the worklist. If nullptr
515 /// is provided, no filtering occurs.
516 void fillWorkLists(const MachineBasicBlock *MBB,
517 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
518 const BlockFilterSet *BlockFilter);
519
520 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
521 BlockFilterSet *BlockFilter = nullptr);
522 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
523 const MachineBasicBlock *OldTop);
524 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
525 const BlockFilterSet &LoopBlockSet);
526 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
527 const BlockFilterSet &LoopBlockSet);
528 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
529 const MachineBasicBlock *OldTop,
530 const MachineBasicBlock *ExitBB,
531 const BlockFilterSet &LoopBlockSet);
532 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
533 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
534 MachineBasicBlock *findBestLoopTop(
535 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
536 MachineBasicBlock *findBestLoopExit(
537 const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
538 BlockFrequency &ExitFreq);
539 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
540 void buildLoopChains(const MachineLoop &L);
541 void rotateLoop(
542 BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
543 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
544 void rotateLoopWithProfile(
545 BlockChain &LoopChain, const MachineLoop &L,
546 const BlockFilterSet &LoopBlockSet);
547 void buildCFGChains();
548 void optimizeBranches();
549 void alignBlocks();
550 /// Returns true if a block should be tail-duplicated to increase fallthrough
551 /// opportunities.
552 bool shouldTailDuplicate(MachineBasicBlock *BB);
553 /// Check the edge frequencies to see if tail duplication will increase
554 /// fallthroughs.
555 bool isProfitableToTailDup(
556 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
557 BranchProbability QProb,
558 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
559
560 /// Check for a trellis layout.
561 bool isTrellis(const MachineBasicBlock *BB,
562 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
563 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
564
565 /// Get the best successor given a trellis layout.
566 BlockAndTailDupResult getBestTrellisSuccessor(
567 const MachineBasicBlock *BB,
568 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
569 BranchProbability AdjustedSumProb, const BlockChain &Chain,
570 const BlockFilterSet *BlockFilter);
571
572 /// Get the best pair of non-conflicting edges.
573 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
574 const MachineBasicBlock *BB,
576
577 /// Returns true if a block can tail duplicate into all unplaced
578 /// predecessors. Filters based on loop.
579 bool canTailDuplicateUnplacedPreds(
580 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
581 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
582
583 /// Find chains of triangles to tail-duplicate where a global analysis works,
584 /// but a local analysis would not find them.
585 void precomputeTriangleChains();
586
587 /// Apply a post-processing step optimizing block placement.
588 void applyExtTsp();
589
590 /// Modify the existing block placement in the function and adjust all jumps.
591 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
592
593 /// Create a single CFG chain from the current block order.
594 void createCFGChainExtTsp();
595
596public:
597 static char ID; // Pass identification, replacement for typeid
598
599 MachineBlockPlacement() : MachineFunctionPass(ID) {
601 }
602
603 bool runOnMachineFunction(MachineFunction &F) override;
604
605 bool allowTailDupPlacement() const {
606 assert(F);
607 return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
608 }
609
610 void getAnalysisUsage(AnalysisUsage &AU) const override {
619 }
620};
621
622} // end anonymous namespace
623
624char MachineBlockPlacement::ID = 0;
625
626char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
627
628INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
629 "Branch Probability Basic Block Placement", false, false)
635INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
636 "Branch Probability Basic Block Placement", false, false)
637
638#ifndef NDEBUG
639/// Helper to print the name of a MBB.
640///
641/// Only used by debug logging.
642static std::string getBlockName(const MachineBasicBlock *BB) {
643 std::string Result;
644 raw_string_ostream OS(Result);
645 OS << printMBBReference(*BB);
646 OS << " ('" << BB->getName() << "')";
647 OS.flush();
648 return Result;
649}
650#endif
651
652/// Mark a chain's successors as having one fewer preds.
653///
654/// When a chain is being merged into the "placed" chain, this routine will
655/// quickly walk the successors of each block in the chain and mark them as
656/// having one fewer active predecessor. It also adds any successors of this
657/// chain which reach the zero-predecessor state to the appropriate worklist.
658void MachineBlockPlacement::markChainSuccessors(
659 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
660 const BlockFilterSet *BlockFilter) {
661 // Walk all the blocks in this chain, marking their successors as having
662 // a predecessor placed.
663 for (MachineBasicBlock *MBB : Chain) {
664 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
665 }
666}
667
668/// Mark a single block's successors as having one fewer preds.
669///
670/// Under normal circumstances, this is only called by markChainSuccessors,
671/// but if a block that was to be placed is completely tail-duplicated away,
672/// and was duplicated into the chain end, we need to redo markBlockSuccessors
673/// for just that block.
674void MachineBlockPlacement::markBlockSuccessors(
675 const BlockChain &Chain, const MachineBasicBlock *MBB,
676 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
677 // Add any successors for which this is the only un-placed in-loop
678 // predecessor to the worklist as a viable candidate for CFG-neutral
679 // placement. No subsequent placement of this block will violate the CFG
680 // shape, so we get to use heuristics to choose a favorable placement.
681 for (MachineBasicBlock *Succ : MBB->successors()) {
682 if (BlockFilter && !BlockFilter->count(Succ))
683 continue;
684 BlockChain &SuccChain = *BlockToChain[Succ];
685 // Disregard edges within a fixed chain, or edges to the loop header.
686 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
687 continue;
688
689 // This is a cross-chain edge that is within the loop, so decrement the
690 // loop predecessor count of the destination chain.
691 if (SuccChain.UnscheduledPredecessors == 0 ||
692 --SuccChain.UnscheduledPredecessors > 0)
693 continue;
694
695 auto *NewBB = *SuccChain.begin();
696 if (NewBB->isEHPad())
697 EHPadWorkList.push_back(NewBB);
698 else
699 BlockWorkList.push_back(NewBB);
700 }
701}
702
703/// This helper function collects the set of successors of block
704/// \p BB that are allowed to be its layout successors, and return
705/// the total branch probability of edges from \p BB to those
706/// blocks.
707BranchProbability MachineBlockPlacement::collectViableSuccessors(
708 const MachineBasicBlock *BB, const BlockChain &Chain,
709 const BlockFilterSet *BlockFilter,
711 // Adjust edge probabilities by excluding edges pointing to blocks that is
712 // either not in BlockFilter or is already in the current chain. Consider the
713 // following CFG:
714 //
715 // --->A
716 // | / \
717 // | B C
718 // | \ / \
719 // ----D E
720 //
721 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
722 // A->C is chosen as a fall-through, D won't be selected as a successor of C
723 // due to CFG constraint (the probability of C->D is not greater than
724 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
725 // when calculating the probability of C->D, D will be selected and we
726 // will get A C D B as the layout of this loop.
727 auto AdjustedSumProb = BranchProbability::getOne();
728 for (MachineBasicBlock *Succ : BB->successors()) {
729 bool SkipSucc = false;
730 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
731 SkipSucc = true;
732 } else {
733 BlockChain *SuccChain = BlockToChain[Succ];
734 if (SuccChain == &Chain) {
735 SkipSucc = true;
736 } else if (Succ != *SuccChain->begin()) {
737 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
738 << " -> Mid chain!\n");
739 continue;
740 }
741 }
742 if (SkipSucc)
743 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
744 else
745 Successors.push_back(Succ);
746 }
747
748 return AdjustedSumProb;
749}
750
751/// The helper function returns the branch probability that is adjusted
752/// or normalized over the new total \p AdjustedSumProb.
755 BranchProbability AdjustedSumProb) {
756 BranchProbability SuccProb;
757 uint32_t SuccProbN = OrigProb.getNumerator();
758 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
759 if (SuccProbN >= SuccProbD)
760 SuccProb = BranchProbability::getOne();
761 else
762 SuccProb = BranchProbability(SuccProbN, SuccProbD);
763
764 return SuccProb;
765}
766
767/// Check if \p BB has exactly the successors in \p Successors.
768static bool
771 if (BB.succ_size() != Successors.size())
772 return false;
773 // We don't want to count self-loops
774 if (Successors.count(&BB))
775 return false;
776 for (MachineBasicBlock *Succ : BB.successors())
777 if (!Successors.count(Succ))
778 return false;
779 return true;
780}
781
782/// Check if a block should be tail duplicated to increase fallthrough
783/// opportunities.
784/// \p BB Block to check.
785bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
786 // Blocks with single successors don't create additional fallthrough
787 // opportunities. Don't duplicate them. TODO: When conditional exits are
788 // analyzable, allow them to be duplicated.
789 bool IsSimple = TailDup.isSimpleBB(BB);
790
791 if (BB->succ_size() == 1)
792 return false;
793 return TailDup.shouldTailDuplicate(IsSimple, *BB);
794}
795
796/// Compare 2 BlockFrequency's with a small penalty for \p A.
797/// In order to be conservative, we apply a X% penalty to account for
798/// increased icache pressure and static heuristics. For small frequencies
799/// we use only the numerators to improve accuracy. For simplicity, we assume the
800/// penalty is less than 100%
801/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
803 BlockFrequency EntryFreq) {
804 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
805 BlockFrequency Gain = A - B;
806 return (Gain / ThresholdProb) >= EntryFreq;
807}
808
809/// Check the edge frequencies to see if tail duplication will increase
810/// fallthroughs. It only makes sense to call this function when
811/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
812/// always locally profitable if we would have picked \p Succ without
813/// considering duplication.
814bool MachineBlockPlacement::isProfitableToTailDup(
815 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
816 BranchProbability QProb,
817 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
818 // We need to do a probability calculation to make sure this is profitable.
819 // First: does succ have a successor that post-dominates? This affects the
820 // calculation. The 2 relevant cases are:
821 // BB BB
822 // | \Qout | \Qout
823 // P| C |P C
824 // = C' = C'
825 // | /Qin | /Qin
826 // | / | /
827 // Succ Succ
828 // / \ | \ V
829 // U/ =V |U \
830 // / \ = D
831 // D E | /
832 // | /
833 // |/
834 // PDom
835 // '=' : Branch taken for that CFG edge
836 // In the second case, Placing Succ while duplicating it into C prevents the
837 // fallthrough of Succ into either D or PDom, because they now have C as an
838 // unplaced predecessor
839
840 // Start by figuring out which case we fall into
841 MachineBasicBlock *PDom = nullptr;
843 // Only scan the relevant successors
844 auto AdjustedSuccSumProb =
845 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
846 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
847 auto BBFreq = MBFI->getBlockFreq(BB);
848 auto SuccFreq = MBFI->getBlockFreq(Succ);
849 BlockFrequency P = BBFreq * PProb;
850 BlockFrequency Qout = BBFreq * QProb;
851 BlockFrequency EntryFreq = MBFI->getEntryFreq();
852 // If there are no more successors, it is profitable to copy, as it strictly
853 // increases fallthrough.
854 if (SuccSuccs.size() == 0)
855 return greaterWithBias(P, Qout, EntryFreq);
856
857 auto BestSuccSucc = BranchProbability::getZero();
858 // Find the PDom or the best Succ if no PDom exists.
859 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
860 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
861 if (Prob > BestSuccSucc)
862 BestSuccSucc = Prob;
863 if (PDom == nullptr)
864 if (MPDT->dominates(SuccSucc, Succ)) {
865 PDom = SuccSucc;
866 break;
867 }
868 }
869 // For the comparisons, we need to know Succ's best incoming edge that isn't
870 // from BB.
871 auto SuccBestPred = BlockFrequency(0);
872 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
873 if (SuccPred == Succ || SuccPred == BB
874 || BlockToChain[SuccPred] == &Chain
875 || (BlockFilter && !BlockFilter->count(SuccPred)))
876 continue;
877 auto Freq = MBFI->getBlockFreq(SuccPred)
878 * MBPI->getEdgeProbability(SuccPred, Succ);
879 if (Freq > SuccBestPred)
880 SuccBestPred = Freq;
881 }
882 // Qin is Succ's best unplaced incoming edge that isn't BB
883 BlockFrequency Qin = SuccBestPred;
884 // If it doesn't have a post-dominating successor, here is the calculation:
885 // BB BB
886 // | \Qout | \
887 // P| C | =
888 // = C' | C
889 // | /Qin | |
890 // | / | C' (+Succ)
891 // Succ Succ /|
892 // / \ | \/ |
893 // U/ =V | == |
894 // / \ | / \|
895 // D E D E
896 // '=' : Branch taken for that CFG edge
897 // Cost in the first case is: P + V
898 // For this calculation, we always assume P > Qout. If Qout > P
899 // The result of this function will be ignored at the caller.
900 // Let F = SuccFreq - Qin
901 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
902
903 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
904 BranchProbability UProb = BestSuccSucc;
905 BranchProbability VProb = AdjustedSuccSumProb - UProb;
906 BlockFrequency F = SuccFreq - Qin;
907 BlockFrequency V = SuccFreq * VProb;
908 BlockFrequency QinU = std::min(Qin, F) * UProb;
909 BlockFrequency BaseCost = P + V;
910 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
911 return greaterWithBias(BaseCost, DupCost, EntryFreq);
912 }
913 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
914 BranchProbability VProb = AdjustedSuccSumProb - UProb;
915 BlockFrequency U = SuccFreq * UProb;
916 BlockFrequency V = SuccFreq * VProb;
917 BlockFrequency F = SuccFreq - Qin;
918 // If there is a post-dominating successor, here is the calculation:
919 // BB BB BB BB
920 // | \Qout | \ | \Qout | \
921 // |P C | = |P C | =
922 // = C' |P C = C' |P C
923 // | /Qin | | | /Qin | |
924 // | / | C' (+Succ) | / | C' (+Succ)
925 // Succ Succ /| Succ Succ /|
926 // | \ V | \/ | | \ V | \/ |
927 // |U \ |U /\ =? |U = |U /\ |
928 // = D = = =?| | D | = =|
929 // | / |/ D | / |/ D
930 // | / | / | = | /
931 // |/ | / |/ | =
932 // Dom Dom Dom Dom
933 // '=' : Branch taken for that CFG edge
934 // The cost for taken branches in the first case is P + U
935 // Let F = SuccFreq - Qin
936 // The cost in the second case (assuming independence), given the layout:
937 // BB, Succ, (C+Succ), D, Dom or the layout:
938 // BB, Succ, D, Dom, (C+Succ)
939 // is Qout + max(F, Qin) * U + min(F, Qin)
940 // compare P + U vs Qout + P * U + Qin.
941 //
942 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
943 //
944 // For the 3rd case, the cost is P + 2 * V
945 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
946 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
947 if (UProb > AdjustedSuccSumProb / 2 &&
948 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
949 Chain, BlockFilter))
950 // Cases 3 & 4
951 return greaterWithBias(
952 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
953 EntryFreq);
954 // Cases 1 & 2
955 return greaterWithBias((P + U),
956 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
957 std::max(Qin, F) * UProb),
958 EntryFreq);
959}
960
961/// Check for a trellis layout. \p BB is the upper part of a trellis if its
962/// successors form the lower part of a trellis. A successor set S forms the
963/// lower part of a trellis if all of the predecessors of S are either in S or
964/// have all of S as successors. We ignore trellises where BB doesn't have 2
965/// successors because for fewer than 2, it's trivial, and for 3 or greater they
966/// are very uncommon and complex to compute optimally. Allowing edges within S
967/// is not strictly a trellis, but the same algorithm works, so we allow it.
968bool MachineBlockPlacement::isTrellis(
969 const MachineBasicBlock *BB,
970 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
971 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
972 // Technically BB could form a trellis with branching factor higher than 2.
973 // But that's extremely uncommon.
974 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
975 return false;
976
978 BB->succ_end());
979 // To avoid reviewing the same predecessors twice.
981
982 for (MachineBasicBlock *Succ : ViableSuccs) {
983 int PredCount = 0;
984 for (auto *SuccPred : Succ->predecessors()) {
985 // Allow triangle successors, but don't count them.
986 if (Successors.count(SuccPred)) {
987 // Make sure that it is actually a triangle.
988 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
989 if (!Successors.count(CheckSucc))
990 return false;
991 continue;
992 }
993 const BlockChain *PredChain = BlockToChain[SuccPred];
994 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
995 PredChain == &Chain || PredChain == BlockToChain[Succ])
996 continue;
997 ++PredCount;
998 // Perform the successor check only once.
999 if (!SeenPreds.insert(SuccPred).second)
1000 continue;
1001 if (!hasSameSuccessors(*SuccPred, Successors))
1002 return false;
1003 }
1004 // If one of the successors has only BB as a predecessor, it is not a
1005 // trellis.
1006 if (PredCount < 1)
1007 return false;
1008 }
1009 return true;
1010}
1011
1012/// Pick the highest total weight pair of edges that can both be laid out.
1013/// The edges in \p Edges[0] are assumed to have a different destination than
1014/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1015/// the individual highest weight edges to the 2 different destinations, or in
1016/// case of a conflict, one of them should be replaced with a 2nd best edge.
1017std::pair<MachineBlockPlacement::WeightedEdge,
1018 MachineBlockPlacement::WeightedEdge>
1019MachineBlockPlacement::getBestNonConflictingEdges(
1020 const MachineBasicBlock *BB,
1022 Edges) {
1023 // Sort the edges, and then for each successor, find the best incoming
1024 // predecessor. If the best incoming predecessors aren't the same,
1025 // then that is clearly the best layout. If there is a conflict, one of the
1026 // successors will have to fallthrough from the second best predecessor. We
1027 // compare which combination is better overall.
1028
1029 // Sort for highest frequency.
1030 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1031
1032 llvm::stable_sort(Edges[0], Cmp);
1033 llvm::stable_sort(Edges[1], Cmp);
1034 auto BestA = Edges[0].begin();
1035 auto BestB = Edges[1].begin();
1036 // Arrange for the correct answer to be in BestA and BestB
1037 // If the 2 best edges don't conflict, the answer is already there.
1038 if (BestA->Src == BestB->Src) {
1039 // Compare the total fallthrough of (Best + Second Best) for both pairs
1040 auto SecondBestA = std::next(BestA);
1041 auto SecondBestB = std::next(BestB);
1042 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1043 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1044 if (BestAScore < BestBScore)
1045 BestA = SecondBestA;
1046 else
1047 BestB = SecondBestB;
1048 }
1049 // Arrange for the BB edge to be in BestA if it exists.
1050 if (BestB->Src == BB)
1051 std::swap(BestA, BestB);
1052 return std::make_pair(*BestA, *BestB);
1053}
1054
1055/// Get the best successor from \p BB based on \p BB being part of a trellis.
1056/// We only handle trellises with 2 successors, so the algorithm is
1057/// straightforward: Find the best pair of edges that don't conflict. We find
1058/// the best incoming edge for each successor in the trellis. If those conflict,
1059/// we consider which of them should be replaced with the second best.
1060/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1061/// comes from \p BB, it will be in \p BestEdges[0]
1062MachineBlockPlacement::BlockAndTailDupResult
1063MachineBlockPlacement::getBestTrellisSuccessor(
1064 const MachineBasicBlock *BB,
1065 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1066 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1067 const BlockFilterSet *BlockFilter) {
1068
1069 BlockAndTailDupResult Result = {nullptr, false};
1071 BB->succ_end());
1072
1073 // We assume size 2 because it's common. For general n, we would have to do
1074 // the Hungarian algorithm, but it's not worth the complexity because more
1075 // than 2 successors is fairly uncommon, and a trellis even more so.
1076 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1077 return Result;
1078
1079 // Collect the edge frequencies of all edges that form the trellis.
1081 int SuccIndex = 0;
1082 for (auto *Succ : ViableSuccs) {
1083 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1084 // Skip any placed predecessors that are not BB
1085 if (SuccPred != BB)
1086 if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1087 BlockToChain[SuccPred] == &Chain ||
1088 BlockToChain[SuccPred] == BlockToChain[Succ])
1089 continue;
1090 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1091 MBPI->getEdgeProbability(SuccPred, Succ);
1092 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1093 }
1094 ++SuccIndex;
1095 }
1096
1097 // Pick the best combination of 2 edges from all the edges in the trellis.
1098 WeightedEdge BestA, BestB;
1099 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1100
1101 if (BestA.Src != BB) {
1102 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1103 // we shouldn't choose any successor. We've already looked and there's a
1104 // better fallthrough edge for all the successors.
1105 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1106 return Result;
1107 }
1108
1109 // Did we pick the triangle edge? If tail-duplication is profitable, do
1110 // that instead. Otherwise merge the triangle edge now while we know it is
1111 // optimal.
1112 if (BestA.Dest == BestB.Src) {
1113 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1114 // would be better.
1115 MachineBasicBlock *Succ1 = BestA.Dest;
1116 MachineBasicBlock *Succ2 = BestB.Dest;
1117 // Check to see if tail-duplication would be profitable.
1118 if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1119 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1120 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1121 Chain, BlockFilter)) {
1123 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1124 dbgs() << " Selected: " << getBlockName(Succ2)
1125 << ", probability: " << Succ2Prob
1126 << " (Tail Duplicate)\n");
1127 Result.BB = Succ2;
1128 Result.ShouldTailDup = true;
1129 return Result;
1130 }
1131 }
1132 // We have already computed the optimal edge for the other side of the
1133 // trellis.
1134 ComputedEdges[BestB.Src] = { BestB.Dest, false };
1135
1136 auto TrellisSucc = BestA.Dest;
1138 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1139 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1140 << ", probability: " << SuccProb << " (Trellis)\n");
1141 Result.BB = TrellisSucc;
1142 return Result;
1143}
1144
1145/// When the option allowTailDupPlacement() is on, this method checks if the
1146/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1147/// into all of its unplaced, unfiltered predecessors, that are not BB.
1148bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1149 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1150 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1151 if (!shouldTailDuplicate(Succ))
1152 return false;
1153
1154 // The result of canTailDuplicate.
1155 bool Duplicate = true;
1156 // Number of possible duplication.
1157 unsigned int NumDup = 0;
1158
1159 // For CFG checking.
1161 BB->succ_end());
1162 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1163 // Make sure all unplaced and unfiltered predecessors can be
1164 // tail-duplicated into.
1165 // Skip any blocks that are already placed or not in this loop.
1166 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1167 || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1168 continue;
1169 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1170 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1171 // This will result in a trellis after tail duplication, so we don't
1172 // need to copy Succ into this predecessor. In the presence
1173 // of a trellis tail duplication can continue to be profitable.
1174 // For example:
1175 // A A
1176 // |\ |\
1177 // | \ | \
1178 // | C | C+BB
1179 // | / | |
1180 // |/ | |
1181 // BB => BB |
1182 // |\ |\/|
1183 // | \ |/\|
1184 // | D | D
1185 // | / | /
1186 // |/ |/
1187 // Succ Succ
1188 //
1189 // After BB was duplicated into C, the layout looks like the one on the
1190 // right. BB and C now have the same successors. When considering
1191 // whether Succ can be duplicated into all its unplaced predecessors, we
1192 // ignore C.
1193 // We can do this because C already has a profitable fallthrough, namely
1194 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1195 // duplication and for this test.
1196 //
1197 // This allows trellises to be laid out in 2 separate chains
1198 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1199 // because it allows the creation of 2 fallthrough paths with links
1200 // between them, and we correctly identify the best layout for these
1201 // CFGs. We want to extend trellises that the user created in addition
1202 // to trellises created by tail-duplication, so we just look for the
1203 // CFG.
1204 continue;
1205 Duplicate = false;
1206 continue;
1207 }
1208 NumDup++;
1209 }
1210
1211 // No possible duplication in current filter set.
1212 if (NumDup == 0)
1213 return false;
1214
1215 // If profile information is available, findDuplicateCandidates can do more
1216 // precise benefit analysis.
1217 if (F->getFunction().hasProfileData())
1218 return true;
1219
1220 // This is mainly for function exit BB.
1221 // The integrated tail duplication is really designed for increasing
1222 // fallthrough from predecessors from Succ to its successors. We may need
1223 // other machanism to handle different cases.
1224 if (Succ->succ_empty())
1225 return true;
1226
1227 // Plus the already placed predecessor.
1228 NumDup++;
1229
1230 // If the duplication candidate has more unplaced predecessors than
1231 // successors, the extra duplication can't bring more fallthrough.
1232 //
1233 // Pred1 Pred2 Pred3
1234 // \ | /
1235 // \ | /
1236 // \ | /
1237 // Dup
1238 // / \
1239 // / \
1240 // Succ1 Succ2
1241 //
1242 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1243 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1244 // but the duplication into Pred3 can't increase fallthrough.
1245 //
1246 // A small number of extra duplication may not hurt too much. We need a better
1247 // heuristic to handle it.
1248 if ((NumDup > Succ->succ_size()) || !Duplicate)
1249 return false;
1250
1251 return true;
1252}
1253
1254/// Find chains of triangles where we believe it would be profitable to
1255/// tail-duplicate them all, but a local analysis would not find them.
1256/// There are 3 ways this can be profitable:
1257/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1258/// longer chains)
1259/// 2) The chains are statically correlated. Branch probabilities have a very
1260/// U-shaped distribution.
1261/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1262/// If the branches in a chain are likely to be from the same side of the
1263/// distribution as their predecessor, but are independent at runtime, this
1264/// transformation is profitable. (Because the cost of being wrong is a small
1265/// fixed cost, unlike the standard triangle layout where the cost of being
1266/// wrong scales with the # of triangles.)
1267/// 3) The chains are dynamically correlated. If the probability that a previous
1268/// branch was taken positively influences whether the next branch will be
1269/// taken
1270/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1271void MachineBlockPlacement::precomputeTriangleChains() {
1272 struct TriangleChain {
1273 std::vector<MachineBasicBlock *> Edges;
1274
1275 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1276 : Edges({src, dst}) {}
1277
1278 void append(MachineBasicBlock *dst) {
1279 assert(getKey()->isSuccessor(dst) &&
1280 "Attempting to append a block that is not a successor.");
1281 Edges.push_back(dst);
1282 }
1283
1284 unsigned count() const { return Edges.size() - 1; }
1285
1286 MachineBasicBlock *getKey() const {
1287 return Edges.back();
1288 }
1289 };
1290
1291 if (TriangleChainCount == 0)
1292 return;
1293
1294 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1295 // Map from last block to the chain that contains it. This allows us to extend
1296 // chains as we find new triangles.
1298 for (MachineBasicBlock &BB : *F) {
1299 // If BB doesn't have 2 successors, it doesn't start a triangle.
1300 if (BB.succ_size() != 2)
1301 continue;
1302 MachineBasicBlock *PDom = nullptr;
1303 for (MachineBasicBlock *Succ : BB.successors()) {
1304 if (!MPDT->dominates(Succ, &BB))
1305 continue;
1306 PDom = Succ;
1307 break;
1308 }
1309 // If BB doesn't have a post-dominating successor, it doesn't form a
1310 // triangle.
1311 if (PDom == nullptr)
1312 continue;
1313 // If PDom has a hint that it is low probability, skip this triangle.
1314 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1315 continue;
1316 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1317 // we're looking for.
1318 if (!shouldTailDuplicate(PDom))
1319 continue;
1320 bool CanTailDuplicate = true;
1321 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1322 // isn't the kind of triangle we're looking for.
1323 for (MachineBasicBlock* Pred : PDom->predecessors()) {
1324 if (Pred == &BB)
1325 continue;
1326 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1327 CanTailDuplicate = false;
1328 break;
1329 }
1330 }
1331 // If we can't tail-duplicate PDom to its predecessors, then skip this
1332 // triangle.
1333 if (!CanTailDuplicate)
1334 continue;
1335
1336 // Now we have an interesting triangle. Insert it if it's not part of an
1337 // existing chain.
1338 // Note: This cannot be replaced with a call insert() or emplace() because
1339 // the find key is BB, but the insert/emplace key is PDom.
1340 auto Found = TriangleChainMap.find(&BB);
1341 // If it is, remove the chain from the map, grow it, and put it back in the
1342 // map with the end as the new key.
1343 if (Found != TriangleChainMap.end()) {
1344 TriangleChain Chain = std::move(Found->second);
1345 TriangleChainMap.erase(Found);
1346 Chain.append(PDom);
1347 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1348 } else {
1349 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1350 assert(InsertResult.second && "Block seen twice.");
1351 (void)InsertResult;
1352 }
1353 }
1354
1355 // Iterating over a DenseMap is safe here, because the only thing in the body
1356 // of the loop is inserting into another DenseMap (ComputedEdges).
1357 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1358 for (auto &ChainPair : TriangleChainMap) {
1359 TriangleChain &Chain = ChainPair.second;
1360 // Benchmarking has shown that due to branch correlation duplicating 2 or
1361 // more triangles is profitable, despite the calculations assuming
1362 // independence.
1363 if (Chain.count() < TriangleChainCount)
1364 continue;
1365 MachineBasicBlock *dst = Chain.Edges.back();
1366 Chain.Edges.pop_back();
1367 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1368 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1369 << getBlockName(dst)
1370 << " as pre-computed based on triangles.\n");
1371
1372 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1373 assert(InsertResult.second && "Block seen twice.");
1374 (void)InsertResult;
1375
1376 dst = src;
1377 }
1378 }
1379}
1380
1381// When profile is not present, return the StaticLikelyProb.
1382// When profile is available, we need to handle the triangle-shape CFG.
1384 const MachineBasicBlock *BB) {
1385 if (!BB->getParent()->getFunction().hasProfileData())
1387 if (BB->succ_size() == 2) {
1388 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1389 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1390 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1391 /* See case 1 below for the cost analysis. For BB->Succ to
1392 * be taken with smaller cost, the following needs to hold:
1393 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1394 * So the threshold T in the calculation below
1395 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1396 * So T / (1 - T) = 2, Yielding T = 2/3
1397 * Also adding user specified branch bias, we have
1398 * T = (2/3)*(ProfileLikelyProb/50)
1399 * = (2*ProfileLikelyProb)/150)
1400 */
1401 return BranchProbability(2 * ProfileLikelyProb, 150);
1402 }
1403 }
1405}
1406
1407/// Checks to see if the layout candidate block \p Succ has a better layout
1408/// predecessor than \c BB. If yes, returns true.
1409/// \p SuccProb: The probability adjusted for only remaining blocks.
1410/// Only used for logging
1411/// \p RealSuccProb: The un-adjusted probability.
1412/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1413/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1414/// considered
1415bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1416 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1417 const BlockChain &SuccChain, BranchProbability SuccProb,
1418 BranchProbability RealSuccProb, const BlockChain &Chain,
1419 const BlockFilterSet *BlockFilter) {
1420
1421 // There isn't a better layout when there are no unscheduled predecessors.
1422 if (SuccChain.UnscheduledPredecessors == 0)
1423 return false;
1424
1425 // There are two basic scenarios here:
1426 // -------------------------------------
1427 // Case 1: triangular shape CFG (if-then):
1428 // BB
1429 // | \
1430 // | \
1431 // | Pred
1432 // | /
1433 // Succ
1434 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1435 // set Succ as the layout successor of BB. Picking Succ as BB's
1436 // successor breaks the CFG constraints (FIXME: define these constraints).
1437 // With this layout, Pred BB
1438 // is forced to be outlined, so the overall cost will be cost of the
1439 // branch taken from BB to Pred, plus the cost of back taken branch
1440 // from Pred to Succ, as well as the additional cost associated
1441 // with the needed unconditional jump instruction from Pred To Succ.
1442
1443 // The cost of the topological order layout is the taken branch cost
1444 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1445 // must hold:
1446 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1447 // < freq(BB->Succ) * taken_branch_cost.
1448 // Ignoring unconditional jump cost, we get
1449 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1450 // prob(BB->Succ) > 2 * prob(BB->Pred)
1451 //
1452 // When real profile data is available, we can precisely compute the
1453 // probability threshold that is needed for edge BB->Succ to be considered.
1454 // Without profile data, the heuristic requires the branch bias to be
1455 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1456 // -----------------------------------------------------------------
1457 // Case 2: diamond like CFG (if-then-else):
1458 // S
1459 // / \
1460 // | \
1461 // BB Pred
1462 // \ /
1463 // Succ
1464 // ..
1465 //
1466 // The current block is BB and edge BB->Succ is now being evaluated.
1467 // Note that edge S->BB was previously already selected because
1468 // prob(S->BB) > prob(S->Pred).
1469 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1470 // choose Pred, we will have a topological ordering as shown on the left
1471 // in the picture below. If we choose Succ, we have the solution as shown
1472 // on the right:
1473 //
1474 // topo-order:
1475 //
1476 // S----- ---S
1477 // | | | |
1478 // ---BB | | BB
1479 // | | | |
1480 // | Pred-- | Succ--
1481 // | | | |
1482 // ---Succ ---Pred--
1483 //
1484 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1485 // = freq(S->Pred) + freq(S->BB)
1486 //
1487 // If we have profile data (i.e, branch probabilities can be trusted), the
1488 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1489 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1490 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1491 // means the cost of topological order is greater.
1492 // When profile data is not available, however, we need to be more
1493 // conservative. If the branch prediction is wrong, breaking the topo-order
1494 // will actually yield a layout with large cost. For this reason, we need
1495 // strong biased branch at block S with Prob(S->BB) in order to select
1496 // BB->Succ. This is equivalent to looking the CFG backward with backward
1497 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1498 // profile data).
1499 // --------------------------------------------------------------------------
1500 // Case 3: forked diamond
1501 // S
1502 // / \
1503 // / \
1504 // BB Pred
1505 // | \ / |
1506 // | \ / |
1507 // | X |
1508 // | / \ |
1509 // | / \ |
1510 // S1 S2
1511 //
1512 // The current block is BB and edge BB->S1 is now being evaluated.
1513 // As above S->BB was already selected because
1514 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1515 //
1516 // topo-order:
1517 //
1518 // S-------| ---S
1519 // | | | |
1520 // ---BB | | BB
1521 // | | | |
1522 // | Pred----| | S1----
1523 // | | | |
1524 // --(S1 or S2) ---Pred--
1525 // |
1526 // S2
1527 //
1528 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1529 // + min(freq(Pred->S1), freq(Pred->S2))
1530 // Non-topo-order cost:
1531 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1532 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1533 // is 0. Then the non topo layout is better when
1534 // freq(S->Pred) < freq(BB->S1).
1535 // This is exactly what is checked below.
1536 // Note there are other shapes that apply (Pred may not be a single block,
1537 // but they all fit this general pattern.)
1539
1540 // Make sure that a hot successor doesn't have a globally more
1541 // important predecessor.
1542 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1543 bool BadCFGConflict = false;
1544
1545 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1546 BlockChain *PredChain = BlockToChain[Pred];
1547 if (Pred == Succ || PredChain == &SuccChain ||
1548 (BlockFilter && !BlockFilter->count(Pred)) ||
1549 PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1550 // This check is redundant except for look ahead. This function is
1551 // called for lookahead by isProfitableToTailDup when BB hasn't been
1552 // placed yet.
1553 (Pred == BB))
1554 continue;
1555 // Do backward checking.
1556 // For all cases above, we need a backward checking to filter out edges that
1557 // are not 'strongly' biased.
1558 // BB Pred
1559 // \ /
1560 // Succ
1561 // We select edge BB->Succ if
1562 // freq(BB->Succ) > freq(Succ) * HotProb
1563 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1564 // HotProb
1565 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1566 // Case 1 is covered too, because the first equation reduces to:
1567 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1568 BlockFrequency PredEdgeFreq =
1569 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1570 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1571 BadCFGConflict = true;
1572 break;
1573 }
1574 }
1575
1576 if (BadCFGConflict) {
1577 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1578 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1579 return true;
1580 }
1581
1582 return false;
1583}
1584
1585/// Select the best successor for a block.
1586///
1587/// This looks across all successors of a particular block and attempts to
1588/// select the "best" one to be the layout successor. It only considers direct
1589/// successors which also pass the block filter. It will attempt to avoid
1590/// breaking CFG structure, but cave and break such structures in the case of
1591/// very hot successor edges.
1592///
1593/// \returns The best successor block found, or null if none are viable, along
1594/// with a boolean indicating if tail duplication is necessary.
1595MachineBlockPlacement::BlockAndTailDupResult
1596MachineBlockPlacement::selectBestSuccessor(
1597 const MachineBasicBlock *BB, const BlockChain &Chain,
1598 const BlockFilterSet *BlockFilter) {
1599 const BranchProbability HotProb(StaticLikelyProb, 100);
1600
1601 BlockAndTailDupResult BestSucc = { nullptr, false };
1602 auto BestProb = BranchProbability::getZero();
1603
1605 auto AdjustedSumProb =
1606 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1607
1608 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1609 << "\n");
1610
1611 // if we already precomputed the best successor for BB, return that if still
1612 // applicable.
1613 auto FoundEdge = ComputedEdges.find(BB);
1614 if (FoundEdge != ComputedEdges.end()) {
1615 MachineBasicBlock *Succ = FoundEdge->second.BB;
1616 ComputedEdges.erase(FoundEdge);
1617 BlockChain *SuccChain = BlockToChain[Succ];
1618 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1619 SuccChain != &Chain && Succ == *SuccChain->begin())
1620 return FoundEdge->second;
1621 }
1622
1623 // if BB is part of a trellis, Use the trellis to determine the optimal
1624 // fallthrough edges
1625 if (isTrellis(BB, Successors, Chain, BlockFilter))
1626 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1627 BlockFilter);
1628
1629 // For blocks with CFG violations, we may be able to lay them out anyway with
1630 // tail-duplication. We keep this vector so we can perform the probability
1631 // calculations the minimum number of times.
1633 DupCandidates;
1634 for (MachineBasicBlock *Succ : Successors) {
1635 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1636 BranchProbability SuccProb =
1637 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1638
1639 BlockChain &SuccChain = *BlockToChain[Succ];
1640 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1641 // predecessor that yields lower global cost.
1642 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1643 Chain, BlockFilter)) {
1644 // If tail duplication would make Succ profitable, place it.
1645 if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1646 DupCandidates.emplace_back(SuccProb, Succ);
1647 continue;
1648 }
1649
1650 LLVM_DEBUG(
1651 dbgs() << " Candidate: " << getBlockName(Succ)
1652 << ", probability: " << SuccProb
1653 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1654 << "\n");
1655
1656 if (BestSucc.BB && BestProb >= SuccProb) {
1657 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1658 continue;
1659 }
1660
1661 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1662 BestSucc.BB = Succ;
1663 BestProb = SuccProb;
1664 }
1665 // Handle the tail duplication candidates in order of decreasing probability.
1666 // Stop at the first one that is profitable. Also stop if they are less
1667 // profitable than BestSucc. Position is important because we preserve it and
1668 // prefer first best match. Here we aren't comparing in order, so we capture
1669 // the position instead.
1670 llvm::stable_sort(DupCandidates,
1671 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1672 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1673 return std::get<0>(L) > std::get<0>(R);
1674 });
1675 for (auto &Tup : DupCandidates) {
1676 BranchProbability DupProb;
1677 MachineBasicBlock *Succ;
1678 std::tie(DupProb, Succ) = Tup;
1679 if (DupProb < BestProb)
1680 break;
1681 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1682 && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1683 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1684 << ", probability: " << DupProb
1685 << " (Tail Duplicate)\n");
1686 BestSucc.BB = Succ;
1687 BestSucc.ShouldTailDup = true;
1688 break;
1689 }
1690 }
1691
1692 if (BestSucc.BB)
1693 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1694
1695 return BestSucc;
1696}
1697
1698/// Select the best block from a worklist.
1699///
1700/// This looks through the provided worklist as a list of candidate basic
1701/// blocks and select the most profitable one to place. The definition of
1702/// profitable only really makes sense in the context of a loop. This returns
1703/// the most frequently visited block in the worklist, which in the case of
1704/// a loop, is the one most desirable to be physically close to the rest of the
1705/// loop body in order to improve i-cache behavior.
1706///
1707/// \returns The best block found, or null if none are viable.
1708MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1709 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1710 // Once we need to walk the worklist looking for a candidate, cleanup the
1711 // worklist of already placed entries.
1712 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1713 // some code complexity) into the loop below.
1714 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1715 return BlockToChain.lookup(BB) == &Chain;
1716 });
1717
1718 if (WorkList.empty())
1719 return nullptr;
1720
1721 bool IsEHPad = WorkList[0]->isEHPad();
1722
1723 MachineBasicBlock *BestBlock = nullptr;
1724 BlockFrequency BestFreq;
1725 for (MachineBasicBlock *MBB : WorkList) {
1726 assert(MBB->isEHPad() == IsEHPad &&
1727 "EHPad mismatch between block and work list.");
1728
1729 BlockChain &SuccChain = *BlockToChain[MBB];
1730 if (&SuccChain == &Chain)
1731 continue;
1732
1733 assert(SuccChain.UnscheduledPredecessors == 0 &&
1734 "Found CFG-violating block");
1735
1736 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1737 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1738 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1739 << " (freq)\n");
1740
1741 // For ehpad, we layout the least probable first as to avoid jumping back
1742 // from least probable landingpads to more probable ones.
1743 //
1744 // FIXME: Using probability is probably (!) not the best way to achieve
1745 // this. We should probably have a more principled approach to layout
1746 // cleanup code.
1747 //
1748 // The goal is to get:
1749 //
1750 // +--------------------------+
1751 // | V
1752 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1753 //
1754 // Rather than:
1755 //
1756 // +-------------------------------------+
1757 // V |
1758 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1759 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1760 continue;
1761
1762 BestBlock = MBB;
1763 BestFreq = CandidateFreq;
1764 }
1765
1766 return BestBlock;
1767}
1768
1769/// Retrieve the first unplaced basic block in the entire function.
1770///
1771/// This routine is called when we are unable to use the CFG to walk through
1772/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1773/// We walk through the function's blocks in order, starting from the
1774/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1775/// re-scanning the entire sequence on repeated calls to this routine.
1776MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1777 const BlockChain &PlacedChain,
1778 MachineFunction::iterator &PrevUnplacedBlockIt) {
1779
1780 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1781 ++I) {
1782 if (BlockToChain[&*I] != &PlacedChain) {
1783 PrevUnplacedBlockIt = I;
1784 // Now select the head of the chain to which the unplaced block belongs
1785 // as the block to place. This will force the entire chain to be placed,
1786 // and satisfies the requirements of merging chains.
1787 return *BlockToChain[&*I]->begin();
1788 }
1789 }
1790 return nullptr;
1791}
1792
1793/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1794///
1795/// This is similar to getFirstUnplacedBlock for the entire function, but since
1796/// the size of BlockFilter is typically far less than the number of blocks in
1797/// the entire function, iterating through the BlockFilter is more efficient.
1798/// When processing the entire funciton, using the version without BlockFilter
1799/// has a complexity of #(loops in function) * #(blocks in function), while this
1800/// version has a complexity of sum(#(loops in block) foreach block in function)
1801/// which is always smaller. For long function mostly sequential in structure,
1802/// the complexity is amortized to 1 * #(blocks in function).
1803MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1804 const BlockChain &PlacedChain,
1805 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1806 const BlockFilterSet *BlockFilter) {
1807 assert(BlockFilter);
1808 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1809 ++PrevUnplacedBlockInFilterIt) {
1810 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1811 if (C != &PlacedChain) {
1812 return *C->begin();
1813 }
1814 }
1815 return nullptr;
1816}
1817
1818void MachineBlockPlacement::fillWorkLists(
1819 const MachineBasicBlock *MBB,
1820 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1821 const BlockFilterSet *BlockFilter = nullptr) {
1822 BlockChain &Chain = *BlockToChain[MBB];
1823 if (!UpdatedPreds.insert(&Chain).second)
1824 return;
1825
1826 assert(
1827 Chain.UnscheduledPredecessors == 0 &&
1828 "Attempting to place block with unscheduled predecessors in worklist.");
1829 for (MachineBasicBlock *ChainBB : Chain) {
1830 assert(BlockToChain[ChainBB] == &Chain &&
1831 "Block in chain doesn't match BlockToChain map.");
1832 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1833 if (BlockFilter && !BlockFilter->count(Pred))
1834 continue;
1835 if (BlockToChain[Pred] == &Chain)
1836 continue;
1837 ++Chain.UnscheduledPredecessors;
1838 }
1839 }
1840
1841 if (Chain.UnscheduledPredecessors != 0)
1842 return;
1843
1844 MachineBasicBlock *BB = *Chain.begin();
1845 if (BB->isEHPad())
1846 EHPadWorkList.push_back(BB);
1847 else
1848 BlockWorkList.push_back(BB);
1849}
1850
1851void MachineBlockPlacement::buildChain(
1852 const MachineBasicBlock *HeadBB, BlockChain &Chain,
1853 BlockFilterSet *BlockFilter) {
1854 assert(HeadBB && "BB must not be null.\n");
1855 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1856 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1857 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1858 if (BlockFilter)
1859 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1860
1861 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1862 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1863 MachineBasicBlock *BB = *std::prev(Chain.end());
1864 while (true) {
1865 assert(BB && "null block found at end of chain in loop.");
1866 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1867 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1868
1869
1870 // Look for the best viable successor if there is one to place immediately
1871 // after this block.
1872 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1873 MachineBasicBlock* BestSucc = Result.BB;
1874 bool ShouldTailDup = Result.ShouldTailDup;
1875 if (allowTailDupPlacement())
1876 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1877 Chain,
1878 BlockFilter));
1879
1880 // If an immediate successor isn't available, look for the best viable
1881 // block among those we've identified as not violating the loop's CFG at
1882 // this point. This won't be a fallthrough, but it will increase locality.
1883 if (!BestSucc)
1884 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1885 if (!BestSucc)
1886 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1887
1888 if (!BestSucc) {
1889 if (BlockFilter)
1890 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1891 BlockFilter);
1892 else
1893 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1894 if (!BestSucc)
1895 break;
1896
1897 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1898 "layout successor until the CFG reduces\n");
1899 }
1900
1901 // Placement may have changed tail duplication opportunities.
1902 // Check for that now.
1903 if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1904 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1905 BlockFilter, PrevUnplacedBlockIt,
1906 PrevUnplacedBlockInFilterIt);
1907 // If the chosen successor was duplicated into BB, don't bother laying
1908 // it out, just go round the loop again with BB as the chain end.
1909 if (!BB->isSuccessor(BestSucc))
1910 continue;
1911 }
1912
1913 // Place this block, updating the datastructures to reflect its placement.
1914 BlockChain &SuccChain = *BlockToChain[BestSucc];
1915 // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1916 // we selected a successor that didn't fit naturally into the CFG.
1917 SuccChain.UnscheduledPredecessors = 0;
1918 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1919 << getBlockName(BestSucc) << "\n");
1920 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1921 Chain.merge(BestSucc, &SuccChain);
1922 BB = *std::prev(Chain.end());
1923 }
1924
1925 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1926 << getBlockName(*Chain.begin()) << "\n");
1927}
1928
1929// If bottom of block BB has only one successor OldTop, in most cases it is
1930// profitable to move it before OldTop, except the following case:
1931//
1932// -->OldTop<-
1933// | . |
1934// | . |
1935// | . |
1936// ---Pred |
1937// | |
1938// BB-----
1939//
1940// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1941// layout the other successor below it, so it can't reduce taken branch.
1942// In this case we keep its original layout.
1943bool
1944MachineBlockPlacement::canMoveBottomBlockToTop(
1945 const MachineBasicBlock *BottomBlock,
1946 const MachineBasicBlock *OldTop) {
1947 if (BottomBlock->pred_size() != 1)
1948 return true;
1949 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1950 if (Pred->succ_size() != 2)
1951 return true;
1952
1953 MachineBasicBlock *OtherBB = *Pred->succ_begin();
1954 if (OtherBB == BottomBlock)
1955 OtherBB = *Pred->succ_rbegin();
1956 if (OtherBB == OldTop)
1957 return false;
1958
1959 return true;
1960}
1961
1962// Find out the possible fall through frequence to the top of a loop.
1964MachineBlockPlacement::TopFallThroughFreq(
1965 const MachineBasicBlock *Top,
1966 const BlockFilterSet &LoopBlockSet) {
1967 BlockFrequency MaxFreq = BlockFrequency(0);
1968 for (MachineBasicBlock *Pred : Top->predecessors()) {
1969 BlockChain *PredChain = BlockToChain[Pred];
1970 if (!LoopBlockSet.count(Pred) &&
1971 (!PredChain || Pred == *std::prev(PredChain->end()))) {
1972 // Found a Pred block can be placed before Top.
1973 // Check if Top is the best successor of Pred.
1974 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1975 bool TopOK = true;
1976 for (MachineBasicBlock *Succ : Pred->successors()) {
1977 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1978 BlockChain *SuccChain = BlockToChain[Succ];
1979 // Check if Succ can be placed after Pred.
1980 // Succ should not be in any chain, or it is the head of some chain.
1981 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1982 (!SuccChain || Succ == *SuccChain->begin())) {
1983 TopOK = false;
1984 break;
1985 }
1986 }
1987 if (TopOK) {
1988 BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1989 MBPI->getEdgeProbability(Pred, Top);
1990 if (EdgeFreq > MaxFreq)
1991 MaxFreq = EdgeFreq;
1992 }
1993 }
1994 }
1995 return MaxFreq;
1996}
1997
1998// Compute the fall through gains when move NewTop before OldTop.
1999//
2000// In following diagram, edges marked as "-" are reduced fallthrough, edges
2001// marked as "+" are increased fallthrough, this function computes
2002//
2003// SUM(increased fallthrough) - SUM(decreased fallthrough)
2004//
2005// |
2006// | -
2007// V
2008// --->OldTop
2009// | .
2010// | .
2011// +| . +
2012// | Pred --->
2013// | |-
2014// | V
2015// --- NewTop <---
2016// |-
2017// V
2018//
2020MachineBlockPlacement::FallThroughGains(
2021 const MachineBasicBlock *NewTop,
2022 const MachineBasicBlock *OldTop,
2023 const MachineBasicBlock *ExitBB,
2024 const BlockFilterSet &LoopBlockSet) {
2025 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2026 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2027 if (ExitBB)
2028 FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
2029 MBPI->getEdgeProbability(NewTop, ExitBB);
2030 BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
2031 MBPI->getEdgeProbability(NewTop, OldTop);
2032
2033 // Find the best Pred of NewTop.
2034 MachineBasicBlock *BestPred = nullptr;
2035 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2036 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2037 if (!LoopBlockSet.count(Pred))
2038 continue;
2039 BlockChain *PredChain = BlockToChain[Pred];
2040 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2041 BlockFrequency EdgeFreq =
2042 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2043 if (EdgeFreq > FallThroughFromPred) {
2044 FallThroughFromPred = EdgeFreq;
2045 BestPred = Pred;
2046 }
2047 }
2048 }
2049
2050 // If NewTop is not placed after Pred, another successor can be placed
2051 // after Pred.
2052 BlockFrequency NewFreq = BlockFrequency(0);
2053 if (BestPred) {
2054 for (MachineBasicBlock *Succ : BestPred->successors()) {
2055 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2056 continue;
2057 if (ComputedEdges.contains(Succ))
2058 continue;
2059 BlockChain *SuccChain = BlockToChain[Succ];
2060 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2061 (SuccChain == BlockToChain[BestPred]))
2062 continue;
2063 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2064 MBPI->getEdgeProbability(BestPred, Succ);
2065 if (EdgeFreq > NewFreq)
2066 NewFreq = EdgeFreq;
2067 }
2068 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2069 MBPI->getEdgeProbability(BestPred, NewTop);
2070 if (NewFreq > OrigEdgeFreq) {
2071 // If NewTop is not the best successor of Pred, then Pred doesn't
2072 // fallthrough to NewTop. So there is no FallThroughFromPred and
2073 // NewFreq.
2074 NewFreq = BlockFrequency(0);
2075 FallThroughFromPred = BlockFrequency(0);
2076 }
2077 }
2078
2080 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2081 BlockFrequency Lost =
2082 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2083 if (Gains > Lost)
2084 Result = Gains - Lost;
2085 return Result;
2086}
2087
2088/// Helper function of findBestLoopTop. Find the best loop top block
2089/// from predecessors of old top.
2090///
2091/// Look for a block which is strictly better than the old top for laying
2092/// out before the old top of the loop. This looks for only two patterns:
2093///
2094/// 1. a block has only one successor, the old loop top
2095///
2096/// Because such a block will always result in an unconditional jump,
2097/// rotating it in front of the old top is always profitable.
2098///
2099/// 2. a block has two successors, one is old top, another is exit
2100/// and it has more than one predecessors
2101///
2102/// If it is below one of its predecessors P, only P can fall through to
2103/// it, all other predecessors need a jump to it, and another conditional
2104/// jump to loop header. If it is moved before loop header, all its
2105/// predecessors jump to it, then fall through to loop header. So all its
2106/// predecessors except P can reduce one taken branch.
2107/// At the same time, move it before old top increases the taken branch
2108/// to loop exit block, so the reduced taken branch will be compared with
2109/// the increased taken branch to the loop exit block.
2111MachineBlockPlacement::findBestLoopTopHelper(
2112 MachineBasicBlock *OldTop,
2113 const MachineLoop &L,
2114 const BlockFilterSet &LoopBlockSet) {
2115 // Check that the header hasn't been fused with a preheader block due to
2116 // crazy branches. If it has, we need to start with the header at the top to
2117 // prevent pulling the preheader into the loop body.
2118 BlockChain &HeaderChain = *BlockToChain[OldTop];
2119 if (!LoopBlockSet.count(*HeaderChain.begin()))
2120 return OldTop;
2121 if (OldTop != *HeaderChain.begin())
2122 return OldTop;
2123
2124 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2125 << "\n");
2126
2127 BlockFrequency BestGains = BlockFrequency(0);
2128 MachineBasicBlock *BestPred = nullptr;
2129 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2130 if (!LoopBlockSet.count(Pred))
2131 continue;
2132 if (Pred == L.getHeader())
2133 continue;
2134 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2135 << Pred->succ_size() << " successors, "
2136 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2137 if (Pred->succ_size() > 2)
2138 continue;
2139
2140 MachineBasicBlock *OtherBB = nullptr;
2141 if (Pred->succ_size() == 2) {
2142 OtherBB = *Pred->succ_begin();
2143 if (OtherBB == OldTop)
2144 OtherBB = *Pred->succ_rbegin();
2145 }
2146
2147 if (!canMoveBottomBlockToTop(Pred, OldTop))
2148 continue;
2149
2150 BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2151 LoopBlockSet);
2152 if ((Gains > BlockFrequency(0)) &&
2153 (Gains > BestGains ||
2154 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2155 BestPred = Pred;
2156 BestGains = Gains;
2157 }
2158 }
2159
2160 // If no direct predecessor is fine, just use the loop header.
2161 if (!BestPred) {
2162 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2163 return OldTop;
2164 }
2165
2166 // Walk backwards through any straight line of predecessors.
2167 while (BestPred->pred_size() == 1 &&
2168 (*BestPred->pred_begin())->succ_size() == 1 &&
2169 *BestPred->pred_begin() != L.getHeader())
2170 BestPred = *BestPred->pred_begin();
2171
2172 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2173 return BestPred;
2174}
2175
2176/// Find the best loop top block for layout.
2177///
2178/// This function iteratively calls findBestLoopTopHelper, until no new better
2179/// BB can be found.
2181MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2182 const BlockFilterSet &LoopBlockSet) {
2183 // Placing the latch block before the header may introduce an extra branch
2184 // that skips this block the first time the loop is executed, which we want
2185 // to avoid when optimising for size.
2186 // FIXME: in theory there is a case that does not introduce a new branch,
2187 // i.e. when the layout predecessor does not fallthrough to the loop header.
2188 // In practice this never happens though: there always seems to be a preheader
2189 // that can fallthrough and that is also placed before the header.
2190 bool OptForSize = F->getFunction().hasOptSize() ||
2191 llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2192 if (OptForSize)
2193 return L.getHeader();
2194
2195 MachineBasicBlock *OldTop = nullptr;
2196 MachineBasicBlock *NewTop = L.getHeader();
2197 while (NewTop != OldTop) {
2198 OldTop = NewTop;
2199 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2200 if (NewTop != OldTop)
2201 ComputedEdges[NewTop] = { OldTop, false };
2202 }
2203 return NewTop;
2204}
2205
2206/// Find the best loop exiting block for layout.
2207///
2208/// This routine implements the logic to analyze the loop looking for the best
2209/// block to layout at the top of the loop. Typically this is done to maximize
2210/// fallthrough opportunities.
2212MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2213 const BlockFilterSet &LoopBlockSet,
2214 BlockFrequency &ExitFreq) {
2215 // We don't want to layout the loop linearly in all cases. If the loop header
2216 // is just a normal basic block in the loop, we want to look for what block
2217 // within the loop is the best one to layout at the top. However, if the loop
2218 // header has be pre-merged into a chain due to predecessors not having
2219 // analyzable branches, *and* the predecessor it is merged with is *not* part
2220 // of the loop, rotating the header into the middle of the loop will create
2221 // a non-contiguous range of blocks which is Very Bad. So start with the
2222 // header and only rotate if safe.
2223 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2224 if (!LoopBlockSet.count(*HeaderChain.begin()))
2225 return nullptr;
2226
2227 BlockFrequency BestExitEdgeFreq;
2228 unsigned BestExitLoopDepth = 0;
2229 MachineBasicBlock *ExitingBB = nullptr;
2230 // If there are exits to outer loops, loop rotation can severely limit
2231 // fallthrough opportunities unless it selects such an exit. Keep a set of
2232 // blocks where rotating to exit with that block will reach an outer loop.
2233 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2234
2235 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2236 << getBlockName(L.getHeader()) << "\n");
2237 for (MachineBasicBlock *MBB : L.getBlocks()) {
2238 BlockChain &Chain = *BlockToChain[MBB];
2239 // Ensure that this block is at the end of a chain; otherwise it could be
2240 // mid-way through an inner loop or a successor of an unanalyzable branch.
2241 if (MBB != *std::prev(Chain.end()))
2242 continue;
2243
2244 // Now walk the successors. We need to establish whether this has a viable
2245 // exiting successor and whether it has a viable non-exiting successor.
2246 // We store the old exiting state and restore it if a viable looping
2247 // successor isn't found.
2248 MachineBasicBlock *OldExitingBB = ExitingBB;
2249 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2250 bool HasLoopingSucc = false;
2251 for (MachineBasicBlock *Succ : MBB->successors()) {
2252 if (Succ->isEHPad())
2253 continue;
2254 if (Succ == MBB)
2255 continue;
2256 BlockChain &SuccChain = *BlockToChain[Succ];
2257 // Don't split chains, either this chain or the successor's chain.
2258 if (&Chain == &SuccChain) {
2259 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2260 << getBlockName(Succ) << " (chain conflict)\n");
2261 continue;
2262 }
2263
2264 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2265 if (LoopBlockSet.count(Succ)) {
2266 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2267 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2268 HasLoopingSucc = true;
2269 continue;
2270 }
2271
2272 unsigned SuccLoopDepth = 0;
2273 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2274 SuccLoopDepth = ExitLoop->getLoopDepth();
2275 if (ExitLoop->contains(&L))
2276 BlocksExitingToOuterLoop.insert(MBB);
2277 }
2278
2279 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2280 LLVM_DEBUG(
2281 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2282 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2283 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2284 // Note that we bias this toward an existing layout successor to retain
2285 // incoming order in the absence of better information. The exit must have
2286 // a frequency higher than the current exit before we consider breaking
2287 // the layout.
2288 BranchProbability Bias(100 - ExitBlockBias, 100);
2289 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2290 ExitEdgeFreq > BestExitEdgeFreq ||
2291 (MBB->isLayoutSuccessor(Succ) &&
2292 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2293 BestExitEdgeFreq = ExitEdgeFreq;
2294 ExitingBB = MBB;
2295 }
2296 }
2297
2298 if (!HasLoopingSucc) {
2299 // Restore the old exiting state, no viable looping successor was found.
2300 ExitingBB = OldExitingBB;
2301 BestExitEdgeFreq = OldBestExitEdgeFreq;
2302 }
2303 }
2304 // Without a candidate exiting block or with only a single block in the
2305 // loop, just use the loop header to layout the loop.
2306 if (!ExitingBB) {
2307 LLVM_DEBUG(
2308 dbgs() << " No other candidate exit blocks, using loop header\n");
2309 return nullptr;
2310 }
2311 if (L.getNumBlocks() == 1) {
2312 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2313 return nullptr;
2314 }
2315
2316 // Also, if we have exit blocks which lead to outer loops but didn't select
2317 // one of them as the exiting block we are rotating toward, disable loop
2318 // rotation altogether.
2319 if (!BlocksExitingToOuterLoop.empty() &&
2320 !BlocksExitingToOuterLoop.count(ExitingBB))
2321 return nullptr;
2322
2323 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2324 << "\n");
2325 ExitFreq = BestExitEdgeFreq;
2326 return ExitingBB;
2327}
2328
2329/// Check if there is a fallthrough to loop header Top.
2330///
2331/// 1. Look for a Pred that can be layout before Top.
2332/// 2. Check if Top is the most possible successor of Pred.
2333bool
2334MachineBlockPlacement::hasViableTopFallthrough(
2335 const MachineBasicBlock *Top,
2336 const BlockFilterSet &LoopBlockSet) {
2337 for (MachineBasicBlock *Pred : Top->predecessors()) {
2338 BlockChain *PredChain = BlockToChain[Pred];
2339 if (!LoopBlockSet.count(Pred) &&
2340 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2341 // Found a Pred block can be placed before Top.
2342 // Check if Top is the best successor of Pred.
2343 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2344 bool TopOK = true;
2345 for (MachineBasicBlock *Succ : Pred->successors()) {
2346 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2347 BlockChain *SuccChain = BlockToChain[Succ];
2348 // Check if Succ can be placed after Pred.
2349 // Succ should not be in any chain, or it is the head of some chain.
2350 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2351 TopOK = false;
2352 break;
2353 }
2354 }
2355 if (TopOK)
2356 return true;
2357 }
2358 }
2359 return false;
2360}
2361
2362/// Attempt to rotate an exiting block to the bottom of the loop.
2363///
2364/// Once we have built a chain, try to rotate it to line up the hot exit block
2365/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2366/// branches. For example, if the loop has fallthrough into its header and out
2367/// of its bottom already, don't rotate it.
2368void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2369 const MachineBasicBlock *ExitingBB,
2370 BlockFrequency ExitFreq,
2371 const BlockFilterSet &LoopBlockSet) {
2372 if (!ExitingBB)
2373 return;
2374
2375 MachineBasicBlock *Top = *LoopChain.begin();
2376 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2377
2378 // If ExitingBB is already the last one in a chain then nothing to do.
2379 if (Bottom == ExitingBB)
2380 return;
2381
2382 // The entry block should always be the first BB in a function.
2383 if (Top->isEntryBlock())
2384 return;
2385
2386 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2387
2388 // If the header has viable fallthrough, check whether the current loop
2389 // bottom is a viable exiting block. If so, bail out as rotating will
2390 // introduce an unnecessary branch.
2391 if (ViableTopFallthrough) {
2392 for (MachineBasicBlock *Succ : Bottom->successors()) {
2393 BlockChain *SuccChain = BlockToChain[Succ];
2394 if (!LoopBlockSet.count(Succ) &&
2395 (!SuccChain || Succ == *SuccChain->begin()))
2396 return;
2397 }
2398
2399 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2400 // frequency is larger than top fallthrough.
2401 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2402 if (FallThrough2Top >= ExitFreq)
2403 return;
2404 }
2405
2406 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2407 if (ExitIt == LoopChain.end())
2408 return;
2409
2410 // Rotating a loop exit to the bottom when there is a fallthrough to top
2411 // trades the entry fallthrough for an exit fallthrough.
2412 // If there is no bottom->top edge, but the chosen exit block does have
2413 // a fallthrough, we break that fallthrough for nothing in return.
2414
2415 // Let's consider an example. We have a built chain of basic blocks
2416 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2417 // By doing a rotation we get
2418 // Bk+1, ..., Bn, B1, ..., Bk
2419 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2420 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2421 // It might be compensated by fallthrough Bn -> B1.
2422 // So we have a condition to avoid creation of extra branch by loop rotation.
2423 // All below must be true to avoid loop rotation:
2424 // If there is a fallthrough to top (B1)
2425 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2426 // There is no fallthrough from bottom (Bn) to top (B1).
2427 // Please note that there is no exit fallthrough from Bn because we checked it
2428 // above.
2429 if (ViableTopFallthrough) {
2430 assert(std::next(ExitIt) != LoopChain.end() &&
2431 "Exit should not be last BB");
2432 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2433 if (ExitingBB->isSuccessor(NextBlockInChain))
2434 if (!Bottom->isSuccessor(Top))
2435 return;
2436 }
2437
2438 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2439 << " at bottom\n");
2440 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2441}
2442
2443/// Attempt to rotate a loop based on profile data to reduce branch cost.
2444///
2445/// With profile data, we can determine the cost in terms of missed fall through
2446/// opportunities when rotating a loop chain and select the best rotation.
2447/// Basically, there are three kinds of cost to consider for each rotation:
2448/// 1. The possibly missed fall through edge (if it exists) from BB out of
2449/// the loop to the loop header.
2450/// 2. The possibly missed fall through edges (if they exist) from the loop
2451/// exits to BB out of the loop.
2452/// 3. The missed fall through edge (if it exists) from the last BB to the
2453/// first BB in the loop chain.
2454/// Therefore, the cost for a given rotation is the sum of costs listed above.
2455/// We select the best rotation with the smallest cost.
2456void MachineBlockPlacement::rotateLoopWithProfile(
2457 BlockChain &LoopChain, const MachineLoop &L,
2458 const BlockFilterSet &LoopBlockSet) {
2459 auto RotationPos = LoopChain.end();
2460 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2461
2462 // The entry block should always be the first BB in a function.
2463 if (ChainHeaderBB->isEntryBlock())
2464 return;
2465
2466 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2467
2468 // A utility lambda that scales up a block frequency by dividing it by a
2469 // branch probability which is the reciprocal of the scale.
2470 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2471 unsigned Scale) -> BlockFrequency {
2472 if (Scale == 0)
2473 return BlockFrequency(0);
2474 // Use operator / between BlockFrequency and BranchProbability to implement
2475 // saturating multiplication.
2476 return Freq / BranchProbability(1, Scale);
2477 };
2478
2479 // Compute the cost of the missed fall-through edge to the loop header if the
2480 // chain head is not the loop header. As we only consider natural loops with
2481 // single header, this computation can be done only once.
2482 BlockFrequency HeaderFallThroughCost(0);
2483 for (auto *Pred : ChainHeaderBB->predecessors()) {
2484 BlockChain *PredChain = BlockToChain[Pred];
2485 if (!LoopBlockSet.count(Pred) &&
2486 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2487 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2488 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2489 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2490 // If the predecessor has only an unconditional jump to the header, we
2491 // need to consider the cost of this jump.
2492 if (Pred->succ_size() == 1)
2493 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2494 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2495 }
2496 }
2497
2498 // Here we collect all exit blocks in the loop, and for each exit we find out
2499 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2500 // as the sum of frequencies of exit edges we collect here, excluding the exit
2501 // edge from the tail of the loop chain.
2503 for (auto *BB : LoopChain) {
2504 auto LargestExitEdgeProb = BranchProbability::getZero();
2505 for (auto *Succ : BB->successors()) {
2506 BlockChain *SuccChain = BlockToChain[Succ];
2507 if (!LoopBlockSet.count(Succ) &&
2508 (!SuccChain || Succ == *SuccChain->begin())) {
2509 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2510 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2511 }
2512 }
2513 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2514 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2515 ExitsWithFreq.emplace_back(BB, ExitFreq);
2516 }
2517 }
2518
2519 // In this loop we iterate every block in the loop chain and calculate the
2520 // cost assuming the block is the head of the loop chain. When the loop ends,
2521 // we should have found the best candidate as the loop chain's head.
2522 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2523 EndIter = LoopChain.end();
2524 Iter != EndIter; Iter++, TailIter++) {
2525 // TailIter is used to track the tail of the loop chain if the block we are
2526 // checking (pointed by Iter) is the head of the chain.
2527 if (TailIter == LoopChain.end())
2528 TailIter = LoopChain.begin();
2529
2530 auto TailBB = *TailIter;
2531
2532 // Calculate the cost by putting this BB to the top.
2534
2535 // If the current BB is the loop header, we need to take into account the
2536 // cost of the missed fall through edge from outside of the loop to the
2537 // header.
2538 if (Iter != LoopChain.begin())
2539 Cost += HeaderFallThroughCost;
2540
2541 // Collect the loop exit cost by summing up frequencies of all exit edges
2542 // except the one from the chain tail.
2543 for (auto &ExitWithFreq : ExitsWithFreq)
2544 if (TailBB != ExitWithFreq.first)
2545 Cost += ExitWithFreq.second;
2546
2547 // The cost of breaking the once fall-through edge from the tail to the top
2548 // of the loop chain. Here we need to consider three cases:
2549 // 1. If the tail node has only one successor, then we will get an
2550 // additional jmp instruction. So the cost here is (MisfetchCost +
2551 // JumpInstCost) * tail node frequency.
2552 // 2. If the tail node has two successors, then we may still get an
2553 // additional jmp instruction if the layout successor after the loop
2554 // chain is not its CFG successor. Note that the more frequently executed
2555 // jmp instruction will be put ahead of the other one. Assume the
2556 // frequency of those two branches are x and y, where x is the frequency
2557 // of the edge to the chain head, then the cost will be
2558 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2559 // 3. If the tail node has more than two successors (this rarely happens),
2560 // we won't consider any additional cost.
2561 if (TailBB->isSuccessor(*Iter)) {
2562 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2563 if (TailBB->succ_size() == 1)
2564 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2565 else if (TailBB->succ_size() == 2) {
2566 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2567 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2568 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2569 ? TailBBFreq * TailToHeadProb.getCompl()
2570 : TailToHeadFreq;
2571 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2572 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2573 }
2574 }
2575
2576 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2577 << getBlockName(*Iter) << " to the top: "
2578 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2579
2580 if (Cost < SmallestRotationCost) {
2581 SmallestRotationCost = Cost;
2582 RotationPos = Iter;
2583 }
2584 }
2585
2586 if (RotationPos != LoopChain.end()) {
2587 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2588 << " to the top\n");
2589 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2590 }
2591}
2592
2593/// Collect blocks in the given loop that are to be placed.
2594///
2595/// When profile data is available, exclude cold blocks from the returned set;
2596/// otherwise, collect all blocks in the loop.
2598MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2599 BlockFilterSet LoopBlockSet;
2600
2601 // Filter cold blocks off from LoopBlockSet when profile data is available.
2602 // Collect the sum of frequencies of incoming edges to the loop header from
2603 // outside. If we treat the loop as a super block, this is the frequency of
2604 // the loop. Then for each block in the loop, we calculate the ratio between
2605 // its frequency and the frequency of the loop block. When it is too small,
2606 // don't add it to the loop chain. If there are outer loops, then this block
2607 // will be merged into the first outer loop chain for which this block is not
2608 // cold anymore. This needs precise profile data and we only do this when
2609 // profile data is available.
2610 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2611 BlockFrequency LoopFreq(0);
2612 for (auto *LoopPred : L.getHeader()->predecessors())
2613 if (!L.contains(LoopPred))
2614 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2615 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2616
2617 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2618 if (LoopBlockSet.count(LoopBB))
2619 continue;
2620 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2621 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2622 continue;
2623 BlockChain *Chain = BlockToChain[LoopBB];
2624 for (MachineBasicBlock *ChainBB : *Chain)
2625 LoopBlockSet.insert(ChainBB);
2626 }
2627 } else
2628 LoopBlockSet.insert(L.block_begin(), L.block_end());
2629
2630 return LoopBlockSet;
2631}
2632
2633/// Forms basic block chains from the natural loop structures.
2634///
2635/// These chains are designed to preserve the existing *structure* of the code
2636/// as much as possible. We can then stitch the chains together in a way which
2637/// both preserves the topological structure and minimizes taken conditional
2638/// branches.
2639void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2640 // First recurse through any nested loops, building chains for those inner
2641 // loops.
2642 for (const MachineLoop *InnerLoop : L)
2643 buildLoopChains(*InnerLoop);
2644
2645 assert(BlockWorkList.empty() &&
2646 "BlockWorkList not empty when starting to build loop chains.");
2647 assert(EHPadWorkList.empty() &&
2648 "EHPadWorkList not empty when starting to build loop chains.");
2649 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2650
2651 // Check if we have profile data for this function. If yes, we will rotate
2652 // this loop by modeling costs more precisely which requires the profile data
2653 // for better layout.
2654 bool RotateLoopWithProfile =
2656 (PreciseRotationCost && F->getFunction().hasProfileData());
2657
2658 // First check to see if there is an obviously preferable top block for the
2659 // loop. This will default to the header, but may end up as one of the
2660 // predecessors to the header if there is one which will result in strictly
2661 // fewer branches in the loop body.
2662 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2663
2664 // If we selected just the header for the loop top, look for a potentially
2665 // profitable exit block in the event that rotating the loop can eliminate
2666 // branches by placing an exit edge at the bottom.
2667 //
2668 // Loops are processed innermost to uttermost, make sure we clear
2669 // PreferredLoopExit before processing a new loop.
2670 PreferredLoopExit = nullptr;
2671 BlockFrequency ExitFreq;
2672 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2673 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2674
2675 BlockChain &LoopChain = *BlockToChain[LoopTop];
2676
2677 // FIXME: This is a really lame way of walking the chains in the loop: we
2678 // walk the blocks, and use a set to prevent visiting a particular chain
2679 // twice.
2680 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2681 assert(LoopChain.UnscheduledPredecessors == 0 &&
2682 "LoopChain should not have unscheduled predecessors.");
2683 UpdatedPreds.insert(&LoopChain);
2684
2685 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2686 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2687
2688 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2689
2690 if (RotateLoopWithProfile)
2691 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2692 else
2693 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2694
2695 LLVM_DEBUG({
2696 // Crash at the end so we get all of the debugging output first.
2697 bool BadLoop = false;
2698 if (LoopChain.UnscheduledPredecessors) {
2699 BadLoop = true;
2700 dbgs() << "Loop chain contains a block without its preds placed!\n"
2701 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2702 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2703 }
2704 for (MachineBasicBlock *ChainBB : LoopChain) {
2705 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2706 if (!LoopBlockSet.remove(ChainBB)) {
2707 // We don't mark the loop as bad here because there are real situations
2708 // where this can occur. For example, with an unanalyzable fallthrough
2709 // from a loop block to a non-loop block or vice versa.
2710 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2711 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2712 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2713 << " Bad block: " << getBlockName(ChainBB) << "\n";
2714 }
2715 }
2716
2717 if (!LoopBlockSet.empty()) {
2718 BadLoop = true;
2719 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2720 dbgs() << "Loop contains blocks never placed into a chain!\n"
2721 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2722 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2723 << " Bad block: " << getBlockName(LoopBB) << "\n";
2724 }
2725 assert(!BadLoop && "Detected problems with the placement of this loop.");
2726 });
2727
2728 BlockWorkList.clear();
2729 EHPadWorkList.clear();
2730}
2731
2732void MachineBlockPlacement::buildCFGChains() {
2733 // Ensure that every BB in the function has an associated chain to simplify
2734 // the assumptions of the remaining algorithm.
2735 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2736 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2737 ++FI) {
2738 MachineBasicBlock *BB = &*FI;
2739 BlockChain *Chain =
2740 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2741 // Also, merge any blocks which we cannot reason about and must preserve
2742 // the exact fallthrough behavior for.
2743 while (true) {
2744 Cond.clear();
2745 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2746 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2747 break;
2748
2749 MachineFunction::iterator NextFI = std::next(FI);
2750 MachineBasicBlock *NextBB = &*NextFI;
2751 // Ensure that the layout successor is a viable block, as we know that
2752 // fallthrough is a possibility.
2753 assert(NextFI != FE && "Can't fallthrough past the last block.");
2754 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2755 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2756 << "\n");
2757 Chain->merge(NextBB, nullptr);
2758#ifndef NDEBUG
2759 BlocksWithUnanalyzableExits.insert(&*BB);
2760#endif
2761 FI = NextFI;
2762 BB = NextBB;
2763 }
2764 }
2765
2766 // Build any loop-based chains.
2767 PreferredLoopExit = nullptr;
2768 for (MachineLoop *L : *MLI)
2769 buildLoopChains(*L);
2770
2771 assert(BlockWorkList.empty() &&
2772 "BlockWorkList should be empty before building final chain.");
2773 assert(EHPadWorkList.empty() &&
2774 "EHPadWorkList should be empty before building final chain.");
2775
2776 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2777 for (MachineBasicBlock &MBB : *F)
2778 fillWorkLists(&MBB, UpdatedPreds);
2779
2780 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2781 buildChain(&F->front(), FunctionChain);
2782
2783#ifndef NDEBUG
2784 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2785#endif
2786 LLVM_DEBUG({
2787 // Crash at the end so we get all of the debugging output first.
2788 bool BadFunc = false;
2789 FunctionBlockSetType FunctionBlockSet;
2790 for (MachineBasicBlock &MBB : *F)
2791 FunctionBlockSet.insert(&MBB);
2792
2793 for (MachineBasicBlock *ChainBB : FunctionChain)
2794 if (!FunctionBlockSet.erase(ChainBB)) {
2795 BadFunc = true;
2796 dbgs() << "Function chain contains a block not in the function!\n"
2797 << " Bad block: " << getBlockName(ChainBB) << "\n";
2798 }
2799
2800 if (!FunctionBlockSet.empty()) {
2801 BadFunc = true;
2802 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2803 dbgs() << "Function contains blocks never placed into a chain!\n"
2804 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2805 }
2806 assert(!BadFunc && "Detected problems with the block placement.");
2807 });
2808
2809 // Remember original layout ordering, so we can update terminators after
2810 // reordering to point to the original layout successor.
2811 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2812 F->getNumBlockIDs());
2813 {
2814 MachineBasicBlock *LastMBB = nullptr;
2815 for (auto &MBB : *F) {
2816 if (LastMBB != nullptr)
2817 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2818 LastMBB = &MBB;
2819 }
2820 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2821 }
2822
2823 // Splice the blocks into place.
2824 MachineFunction::iterator InsertPos = F->begin();
2825 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2826 for (MachineBasicBlock *ChainBB : FunctionChain) {
2827 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2828 : " ... ")
2829 << getBlockName(ChainBB) << "\n");
2830 if (InsertPos != MachineFunction::iterator(ChainBB))
2831 F->splice(InsertPos, ChainBB);
2832 else
2833 ++InsertPos;
2834
2835 // Update the terminator of the previous block.
2836 if (ChainBB == *FunctionChain.begin())
2837 continue;
2838 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2839
2840 // FIXME: It would be awesome of updateTerminator would just return rather
2841 // than assert when the branch cannot be analyzed in order to remove this
2842 // boiler plate.
2843 Cond.clear();
2844 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2845
2846#ifndef NDEBUG
2847 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2848 // Given the exact block placement we chose, we may actually not _need_ to
2849 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2850 // do that at this point is a bug.
2851 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2852 !PrevBB->canFallThrough()) &&
2853 "Unexpected block with un-analyzable fallthrough!");
2854 Cond.clear();
2855 TBB = FBB = nullptr;
2856 }
2857#endif
2858
2859 // The "PrevBB" is not yet updated to reflect current code layout, so,
2860 // o. it may fall-through to a block without explicit "goto" instruction
2861 // before layout, and no longer fall-through it after layout; or
2862 // o. just opposite.
2863 //
2864 // analyzeBranch() may return erroneous value for FBB when these two
2865 // situations take place. For the first scenario FBB is mistakenly set NULL;
2866 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2867 // mistakenly pointing to "*BI".
2868 // Thus, if the future change needs to use FBB before the layout is set, it
2869 // has to correct FBB first by using the code similar to the following:
2870 //
2871 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2872 // PrevBB->updateTerminator();
2873 // Cond.clear();
2874 // TBB = FBB = nullptr;
2875 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2876 // // FIXME: This should never take place.
2877 // TBB = FBB = nullptr;
2878 // }
2879 // }
2880 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2881 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2882 }
2883 }
2884
2885 // Fixup the last block.
2886 Cond.clear();
2887 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2888 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2889 MachineBasicBlock *PrevBB = &F->back();
2890 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2891 }
2892
2893 BlockWorkList.clear();
2894 EHPadWorkList.clear();
2895}
2896
2897void MachineBlockPlacement::optimizeBranches() {
2898 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2899 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2900
2901 // Now that all the basic blocks in the chain have the proper layout,
2902 // make a final call to analyzeBranch with AllowModify set.
2903 // Indeed, the target may be able to optimize the branches in a way we
2904 // cannot because all branches may not be analyzable.
2905 // E.g., the target may be able to remove an unconditional branch to
2906 // a fallthrough when it occurs after predicated terminators.
2907 for (MachineBasicBlock *ChainBB : FunctionChain) {
2908 Cond.clear();
2909 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2910 if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2911 // If PrevBB has a two-way branch, try to re-order the branches
2912 // such that we branch to the successor with higher probability first.
2913 if (TBB && !Cond.empty() && FBB &&
2914 MBPI->getEdgeProbability(ChainBB, FBB) >
2915 MBPI->getEdgeProbability(ChainBB, TBB) &&
2917 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2918 << getBlockName(ChainBB) << "\n");
2919 LLVM_DEBUG(dbgs() << " Edge probability: "
2920 << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2921 << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2922 DebugLoc dl; // FIXME: this is nowhere
2923 TII->removeBranch(*ChainBB);
2924 TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2925 }
2926 }
2927 }
2928}
2929
2930void MachineBlockPlacement::alignBlocks() {
2931 // Walk through the backedges of the function now that we have fully laid out
2932 // the basic blocks and align the destination of each backedge. We don't rely
2933 // exclusively on the loop info here so that we can align backedges in
2934 // unnatural CFGs and backedges that were introduced purely because of the
2935 // loop rotations done during this layout pass.
2936 if (F->getFunction().hasMinSize() ||
2937 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2938 return;
2939 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2940 if (FunctionChain.begin() == FunctionChain.end())
2941 return; // Empty chain.
2942
2943 const BranchProbability ColdProb(1, 5); // 20%
2944 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2945 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2946 for (MachineBasicBlock *ChainBB : FunctionChain) {
2947 if (ChainBB == *FunctionChain.begin())
2948 continue;
2949
2950 // Don't align non-looping basic blocks. These are unlikely to execute
2951 // enough times to matter in practice. Note that we'll still handle
2952 // unnatural CFGs inside of a natural outer loop (the common case) and
2953 // rotated loops.
2954 MachineLoop *L = MLI->getLoopFor(ChainBB);
2955 if (!L)
2956 continue;
2957
2958 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
2959 unsigned MDAlign = 1;
2960 MDNode *LoopID = L->getLoopID();
2961 if (LoopID) {
2962 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
2963 MDNode *MD = dyn_cast<MDNode>(MDO);
2964 if (MD == nullptr)
2965 continue;
2966 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
2967 if (S == nullptr)
2968 continue;
2969 if (S->getString() == "llvm.loop.align") {
2970 assert(MD->getNumOperands() == 2 &&
2971 "per-loop align metadata should have two operands.");
2972 MDAlign =
2973 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
2974 assert(MDAlign >= 1 && "per-loop align value must be positive.");
2975 }
2976 }
2977 }
2978
2979 // Use max of the TLIAlign and MDAlign
2980 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
2981 if (LoopAlign == 1)
2982 continue; // Don't care about loop alignment.
2983
2984 // If the block is cold relative to the function entry don't waste space
2985 // aligning it.
2986 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
2987 if (Freq < WeightedEntryFreq)
2988 continue;
2989
2990 // If the block is cold relative to its loop header, don't align it
2991 // regardless of what edges into the block exist.
2992 MachineBasicBlock *LoopHeader = L->getHeader();
2993 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
2994 if (Freq < (LoopHeaderFreq * ColdProb))
2995 continue;
2996
2997 // If the global profiles indicates so, don't align it.
2998 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
2999 !TLI->alignLoopsWithOptSize())
3000 continue;
3001
3002 // Check for the existence of a non-layout predecessor which would benefit
3003 // from aligning this block.
3004 MachineBasicBlock *LayoutPred =
3005 &*std::prev(MachineFunction::iterator(ChainBB));
3006
3007 auto DetermineMaxAlignmentPadding = [&]() {
3008 // Set the maximum bytes allowed to be emitted for alignment.
3009 unsigned MaxBytes;
3010 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3012 else
3013 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3014 ChainBB->setMaxBytesForAlignment(MaxBytes);
3015 };
3016
3017 // Force alignment if all the predecessors are jumps. We already checked
3018 // that the block isn't cold above.
3019 if (!LayoutPred->isSuccessor(ChainBB)) {
3020 ChainBB->setAlignment(LoopAlign);
3021 DetermineMaxAlignmentPadding();
3022 continue;
3023 }
3024
3025 // Align this block if the layout predecessor's edge into this block is
3026 // cold relative to the block. When this is true, other predecessors make up
3027 // all of the hot entries into the block and thus alignment is likely to be
3028 // important.
3029 BranchProbability LayoutProb =
3030 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3031 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3032 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3033 ChainBB->setAlignment(LoopAlign);
3034 DetermineMaxAlignmentPadding();
3035 }
3036 }
3037}
3038
3039/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3040/// it was duplicated into its chain predecessor and removed.
3041/// \p BB - Basic block that may be duplicated.
3042///
3043/// \p LPred - Chosen layout predecessor of \p BB.
3044/// Updated to be the chain end if LPred is removed.
3045/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3046/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3047/// Used to identify which blocks to update predecessor
3048/// counts.
3049/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3050/// chosen in the given order due to unnatural CFG
3051/// only needed if \p BB is removed and
3052/// \p PrevUnplacedBlockIt pointed to \p BB.
3053/// @return true if \p BB was removed.
3054bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3056 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3057 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3058 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3059 bool Removed, DuplicatedToLPred;
3060 bool DuplicatedToOriginalLPred;
3061 Removed = maybeTailDuplicateBlock(
3062 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3063 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3064 if (!Removed)
3065 return false;
3066 DuplicatedToOriginalLPred = DuplicatedToLPred;
3067 // Iteratively try to duplicate again. It can happen that a block that is
3068 // duplicated into is still small enough to be duplicated again.
3069 // No need to call markBlockSuccessors in this case, as the blocks being
3070 // duplicated from here on are already scheduled.
3071 while (DuplicatedToLPred && Removed) {
3072 MachineBasicBlock *DupBB, *DupPred;
3073 // The removal callback causes Chain.end() to be updated when a block is
3074 // removed. On the first pass through the loop, the chain end should be the
3075 // same as it was on function entry. On subsequent passes, because we are
3076 // duplicating the block at the end of the chain, if it is removed the
3077 // chain will have shrunk by one block.
3078 BlockChain::iterator ChainEnd = Chain.end();
3079 DupBB = *(--ChainEnd);
3080 // Now try to duplicate again.
3081 if (ChainEnd == Chain.begin())
3082 break;
3083 DupPred = *std::prev(ChainEnd);
3084 Removed = maybeTailDuplicateBlock(
3085 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3086 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3087 }
3088 // If BB was duplicated into LPred, it is now scheduled. But because it was
3089 // removed, markChainSuccessors won't be called for its chain. Instead we
3090 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3091 // at the end because repeating the tail duplication can increase the number
3092 // of unscheduled predecessors.
3093 LPred = *std::prev(Chain.end());
3094 if (DuplicatedToOriginalLPred)
3095 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3096 return true;
3097}
3098
3099/// Tail duplicate \p BB into (some) predecessors if profitable.
3100/// \p BB - Basic block that may be duplicated
3101/// \p LPred - Chosen layout predecessor of \p BB
3102/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3103/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3104/// Used to identify which blocks to update predecessor
3105/// counts.
3106/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3107/// chosen in the given order due to unnatural CFG
3108/// only needed if \p BB is removed and
3109/// \p PrevUnplacedBlockIt pointed to \p BB.
3110/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3111/// \return - True if the block was duplicated into all preds and removed.
3112bool MachineBlockPlacement::maybeTailDuplicateBlock(
3113 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3114 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3115 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3116 bool &DuplicatedToLPred) {
3117 DuplicatedToLPred = false;
3118 if (!shouldTailDuplicate(BB))
3119 return false;
3120
3121 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3122 << "\n");
3123
3124 // This has to be a callback because none of it can be done after
3125 // BB is deleted.
3126 bool Removed = false;
3127 auto RemovalCallback =
3128 [&](MachineBasicBlock *RemBB) {
3129 // Signal to outer function
3130 Removed = true;
3131
3132 // Conservative default.
3133 bool InWorkList = true;
3134 // Remove from the Chain and Chain Map
3135 if (BlockToChain.count(RemBB)) {
3136 BlockChain *Chain = BlockToChain[RemBB];
3137 InWorkList = Chain->UnscheduledPredecessors == 0;
3138 Chain->remove(RemBB);
3139 BlockToChain.erase(RemBB);
3140 }
3141
3142 // Handle the unplaced block iterator
3143 if (&(*PrevUnplacedBlockIt) == RemBB) {
3144 PrevUnplacedBlockIt++;
3145 }
3146
3147 // Handle the Work Lists
3148 if (InWorkList) {
3149 SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3150 if (RemBB->isEHPad())
3151 RemoveList = EHPadWorkList;
3152 llvm::erase(RemoveList, RemBB);
3153 }
3154
3155 // Handle the filter set
3156 if (BlockFilter) {
3157 auto It = llvm::find(*BlockFilter, RemBB);
3158 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3159 // pointing to the same element as before.
3160 if (It != BlockFilter->end()) {
3161 if (It < PrevUnplacedBlockInFilterIt) {
3162 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3163 // BlockFilter is a SmallVector so all elements after RemBB are
3164 // shifted to the front by 1 after its deletion.
3165 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3166 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3167 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3168 (void)PrevBB;
3169 } else if (It == PrevUnplacedBlockInFilterIt)
3170 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3171 // have to set it to the next element.
3172 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3173 else
3174 BlockFilter->erase(It);
3175 }
3176 }
3177
3178 // Remove the block from loop info.
3179 MLI->removeBlock(RemBB);
3180 if (RemBB == PreferredLoopExit)
3181 PreferredLoopExit = nullptr;
3182
3183 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3184 << getBlockName(RemBB) << "\n");
3185 };
3186 auto RemovalCallbackRef =
3188
3190 bool IsSimple = TailDup.isSimpleBB(BB);
3192 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3193 if (F->getFunction().hasProfileData()) {
3194 // We can do partial duplication with precise profile information.
3195 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3196 if (CandidatePreds.size() == 0)
3197 return false;
3198 if (CandidatePreds.size() < BB->pred_size())
3199 CandidatePtr = &CandidatePreds;
3200 }
3201 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3202 &RemovalCallbackRef, CandidatePtr);
3203
3204 // Update UnscheduledPredecessors to reflect tail-duplication.
3205 DuplicatedToLPred = false;
3206 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3207 // We're only looking for unscheduled predecessors that match the filter.
3208 BlockChain* PredChain = BlockToChain[Pred];
3209 if (Pred == LPred)
3210 DuplicatedToLPred = true;
3211 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3212 || PredChain == &Chain)
3213 continue;
3214 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3215 if (BlockFilter && !BlockFilter->count(NewSucc))
3216 continue;
3217 BlockChain *NewChain = BlockToChain[NewSucc];
3218 if (NewChain != &Chain && NewChain != PredChain)
3219 NewChain->UnscheduledPredecessors++;
3220 }
3221 }
3222 return Removed;
3223}
3224
3225// Count the number of actual machine instructions.
3227 uint64_t InstrCount = 0;
3228 for (MachineInstr &MI : *MBB) {
3229 if (!MI.isPHI() && !MI.isMetaInstruction())
3230 InstrCount += 1;
3231 }
3232 return InstrCount;
3233}
3234
3235// The size cost of duplication is the instruction size of the duplicated block.
3236// So we should scale the threshold accordingly. But the instruction size is not
3237// available on all targets, so we use the number of instructions instead.
3238BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3239 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3240}
3241
3242// Returns true if BB is Pred's best successor.
3243bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3244 MachineBasicBlock *Pred,
3245 BlockFilterSet *BlockFilter) {
3246 if (BB == Pred)
3247 return false;
3248 if (BlockFilter && !BlockFilter->count(Pred))
3249 return false;
3250 BlockChain *PredChain = BlockToChain[Pred];
3251 if (PredChain && (Pred != *std::prev(PredChain->end())))
3252 return false;
3253
3254 // Find the successor with largest probability excluding BB.
3256 for (MachineBasicBlock *Succ : Pred->successors())
3257 if (Succ != BB) {
3258 if (BlockFilter && !BlockFilter->count(Succ))
3259 continue;
3260 BlockChain *SuccChain = BlockToChain[Succ];
3261 if (SuccChain && (Succ != *SuccChain->begin()))
3262 continue;
3263 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3264 if (SuccProb > BestProb)
3265 BestProb = SuccProb;
3266 }
3267
3268 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3269 if (BBProb <= BestProb)
3270 return false;
3271
3272 // Compute the number of reduced taken branches if Pred falls through to BB
3273 // instead of another successor. Then compare it with threshold.
3274 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3275 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3276 return Gain > scaleThreshold(BB);
3277}
3278
3279// Find out the predecessors of BB and BB can be beneficially duplicated into
3280// them.
3281void MachineBlockPlacement::findDuplicateCandidates(
3284 BlockFilterSet *BlockFilter) {
3285 MachineBasicBlock *Fallthrough = nullptr;
3286 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3287 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3290
3291 // Sort for highest frequency.
3292 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3293 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3294 };
3295 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3296 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3297 };
3298 llvm::stable_sort(Succs, CmpSucc);
3299 llvm::stable_sort(Preds, CmpPred);
3300
3301 auto SuccIt = Succs.begin();
3302 if (SuccIt != Succs.end()) {
3303 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3304 }
3305
3306 // For each predecessors of BB, compute the benefit of duplicating BB,
3307 // if it is larger than the threshold, add it into Candidates.
3308 //
3309 // If we have following control flow.
3310 //
3311 // PB1 PB2 PB3 PB4
3312 // \ | / /\
3313 // \ | / / \
3314 // \ |/ / \
3315 // BB----/ OB
3316 // /\
3317 // / \
3318 // SB1 SB2
3319 //
3320 // And it can be partially duplicated as
3321 //
3322 // PB2+BB
3323 // | PB1 PB3 PB4
3324 // | | / /\
3325 // | | / / \
3326 // | |/ / \
3327 // | BB----/ OB
3328 // |\ /|
3329 // | X |
3330 // |/ \|
3331 // SB2 SB1
3332 //
3333 // The benefit of duplicating into a predecessor is defined as
3334 // Orig_taken_branch - Duplicated_taken_branch
3335 //
3336 // The Orig_taken_branch is computed with the assumption that predecessor
3337 // jumps to BB and the most possible successor is laid out after BB.
3338 //
3339 // The Duplicated_taken_branch is computed with the assumption that BB is
3340 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3341 // SB2 for PB2 in our case). If there is no available successor, the combined
3342 // block jumps to all BB's successor, like PB3 in this example.
3343 //
3344 // If a predecessor has multiple successors, so BB can't be duplicated into
3345 // it. But it can beneficially fall through to BB, and duplicate BB into other
3346 // predecessors.
3347 for (MachineBasicBlock *Pred : Preds) {
3348 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3349
3350 if (!TailDup.canTailDuplicate(BB, Pred)) {
3351 // BB can't be duplicated into Pred, but it is possible to be layout
3352 // below Pred.
3353 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3354 Fallthrough = Pred;
3355 if (SuccIt != Succs.end())
3356 SuccIt++;
3357 }
3358 continue;
3359 }
3360
3361 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3362 BlockFrequency DupCost;
3363 if (SuccIt == Succs.end()) {
3364 // Jump to all successors;
3365 if (Succs.size() > 0)
3366 DupCost += PredFreq;
3367 } else {
3368 // Fallthrough to *SuccIt, jump to all other successors;
3369 DupCost += PredFreq;
3370 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3371 }
3372
3373 assert(OrigCost >= DupCost);
3374 OrigCost -= DupCost;
3375 if (OrigCost > BBDupThreshold) {
3376 Candidates.push_back(Pred);
3377 if (SuccIt != Succs.end())
3378 SuccIt++;
3379 }
3380 }
3381
3382 // No predecessors can optimally fallthrough to BB.
3383 // So we can change one duplication into fallthrough.
3384 if (!Fallthrough) {
3385 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3386 Candidates[0] = Candidates.back();
3387 Candidates.pop_back();
3388 }
3389 }
3390}
3391
3392void MachineBlockPlacement::initDupThreshold() {
3393 DupThreshold = BlockFrequency(0);
3394 if (!F->getFunction().hasProfileData())
3395 return;
3396
3397 // We prefer to use prifile count.
3398 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3399 if (HotThreshold != UINT64_MAX) {
3400 UseProfileCount = true;
3401 DupThreshold =
3402 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3403 return;
3404 }
3405
3406 // Profile count is not available, we can use block frequency instead.
3407 BlockFrequency MaxFreq = BlockFrequency(0);
3408 for (MachineBasicBlock &MBB : *F) {
3409 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3410 if (Freq > MaxFreq)
3411 MaxFreq = Freq;
3412 }
3413
3414 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3415 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3416 UseProfileCount = false;
3417}
3418
3419bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3420 if (skipFunction(MF.getFunction()))
3421 return false;
3422
3423 // Check for single-block functions and skip them.
3424 if (std::next(MF.begin()) == MF.end())
3425 return false;
3426
3427 F = &MF;
3428 MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3429 MBFI = std::make_unique<MBFIWrapper>(
3430 getAnalysis<MachineBlockFrequencyInfo>());
3431 MLI = &getAnalysis<MachineLoopInfo>();
3432 TII = MF.getSubtarget().getInstrInfo();
3433 TLI = MF.getSubtarget().getTargetLowering();
3434 MPDT = nullptr;
3435 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3436
3437 initDupThreshold();
3438
3439 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3440 // there are no MachineLoops.
3441 PreferredLoopExit = nullptr;
3442
3443 assert(BlockToChain.empty() &&
3444 "BlockToChain map should be empty before starting placement.");
3445 assert(ComputedEdges.empty() &&
3446 "Computed Edge map should be empty before starting placement.");
3447
3448 unsigned TailDupSize = TailDupPlacementThreshold;
3449 // If only the aggressive threshold is explicitly set, use it.
3450 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3451 TailDupPlacementThreshold.getNumOccurrences() == 0)
3453
3454 TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3455 // For aggressive optimization, we can adjust some thresholds to be less
3456 // conservative.
3457 if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
3458 // At O3 we should be more willing to copy blocks for tail duplication. This
3459 // increases size pressure, so we only do it at O3
3460 // Do this unless only the regular threshold is explicitly set.
3461 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3462 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3464 }
3465
3466 // If there's no threshold provided through options, query the target
3467 // information for a threshold instead.
3468 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3469 (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
3470 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3471 TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3472
3473 if (allowTailDupPlacement()) {
3474 MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3475 bool OptForSize = MF.getFunction().hasOptSize() ||
3476 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3477 if (OptForSize)
3478 TailDupSize = 1;
3479 bool PreRegAlloc = false;
3480 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3481 /* LayoutMode */ true, TailDupSize);
3482 precomputeTriangleChains();
3483 }
3484
3485 buildCFGChains();
3486
3487 // Changing the layout can create new tail merging opportunities.
3488 // TailMerge can create jump into if branches that make CFG irreducible for
3489 // HW that requires structured CFG.
3490 bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3491 PassConfig->getEnableTailMerge() &&
3493 // No tail merging opportunities if the block number is less than four.
3494 if (MF.size() > 3 && EnableTailMerge) {
3495 unsigned TailMergeSize = TailDupSize + 1;
3496 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3497 *MBFI, *MBPI, PSI, TailMergeSize);
3498
3499 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3500 /*AfterPlacement=*/true)) {
3501 // Redo the layout if tail merging creates/removes/moves blocks.
3502 BlockToChain.clear();
3503 ComputedEdges.clear();
3504 // Must redo the post-dominator tree if blocks were changed.
3505 if (MPDT)
3506 MPDT->recalculate(MF);
3507 ChainAllocator.DestroyAll();
3508 buildCFGChains();
3509 }
3510 }
3511
3512 // Apply a post-processing optimizing block placement.
3513 if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3515 // Find a new placement and modify the layout of the blocks in the function.
3516 applyExtTsp();
3517
3518 // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3519 createCFGChainExtTsp();
3520 }
3521
3522 optimizeBranches();
3523 alignBlocks();
3524
3525 BlockToChain.clear();
3526 ComputedEdges.clear();
3527 ChainAllocator.DestroyAll();
3528
3529 bool HasMaxBytesOverride =
3530 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3531
3532 if (AlignAllBlock)
3533 // Align all of the blocks in the function to a specific alignment.
3534 for (MachineBasicBlock &MBB : MF) {
3535 if (HasMaxBytesOverride)
3538 else
3540 }
3541 else if (AlignAllNonFallThruBlocks) {
3542 // Align all of the blocks that have no fall-through predecessors to a
3543 // specific alignment.
3544 for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3545 auto LayoutPred = std::prev(MBI);
3546 if (!LayoutPred->isSuccessor(&*MBI)) {
3547 if (HasMaxBytesOverride)
3548 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3550 else
3551 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3552 }
3553 }
3554 }
3556 (ViewBlockFreqFuncName.empty() ||
3557 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3559 MF.RenumberBlocks();
3560 MBFI->view("MBP." + MF.getName(), false);
3561 }
3562
3563 // We always return true as we have no way to track whether the final order
3564 // differs from the original order.
3565 return true;
3566}
3567
3568void MachineBlockPlacement::applyExtTsp() {
3569 // Prepare data; blocks are indexed by their index in the current ordering.
3571 BlockIndex.reserve(F->size());
3572 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3573 CurrentBlockOrder.reserve(F->size());
3574 size_t NumBlocks = 0;
3575 for (const MachineBasicBlock &MBB : *F) {
3576 BlockIndex[&MBB] = NumBlocks++;
3577 CurrentBlockOrder.push_back(&MBB);
3578 }
3579
3580 auto BlockSizes = std::vector<uint64_t>(F->size());
3581 auto BlockCounts = std::vector<uint64_t>(F->size());
3582 std::vector<codelayout::EdgeCount> JumpCounts;
3583 for (MachineBasicBlock &MBB : *F) {
3584 // Getting the block frequency.
3585 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3586 BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3587 // Getting the block size:
3588 // - approximate the size of an instruction by 4 bytes, and
3589 // - ignore debug instructions.
3590 // Note: getting the exact size of each block is target-dependent and can be
3591 // done by extending the interface of MCCodeEmitter. Experimentally we do
3592 // not see a perf improvement with the exact block sizes.
3593 auto NonDbgInsts =
3595 int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3596 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3597 // Getting jump frequencies.
3598 for (MachineBasicBlock *Succ : MBB.successors()) {
3599 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3600 BlockFrequency JumpFreq = BlockFreq * EP;
3601 JumpCounts.push_back(
3602 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3603 }
3604 }
3605
3606 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3607 << " with profile = " << F->getFunction().hasProfileData()
3608 << " (" << F->getName().str() << ")"
3609 << "\n");
3610 LLVM_DEBUG(
3611 dbgs() << format(" original layout score: %0.2f\n",
3612 calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3613
3614 // Run the layout algorithm.
3615 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3616 std::vector<const MachineBasicBlock *> NewBlockOrder;
3617 NewBlockOrder.reserve(F->size());
3618 for (uint64_t Node : NewOrder) {
3619 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3620 }
3621 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3622 calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3623 JumpCounts)));
3624
3625 // Assign new block order.
3626 assignBlockOrder(NewBlockOrder);
3627}
3628
3629void MachineBlockPlacement::assignBlockOrder(
3630 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3631 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3632 F->RenumberBlocks();
3633
3634 bool HasChanges = false;
3635 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3636 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3637 HasChanges = true;
3638 break;
3639 }
3640 }
3641 // Stop early if the new block order is identical to the existing one.
3642 if (!HasChanges)
3643 return;
3644
3645 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3646 for (auto &MBB : *F) {
3647 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3648 }
3649
3650 // Sort basic blocks in the function according to the computed order.
3652 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3653 NewIndex[MBB] = NewIndex.size();
3654 }
3655 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3656 return NewIndex[&L] < NewIndex[&R];
3657 });
3658
3659 // Update basic block branches by inserting explicit fallthrough branches
3660 // when required and re-optimize branches when possible.
3661 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3663 for (auto &MBB : *F) {
3664 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3666 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3667 // If this block had a fallthrough before we need an explicit unconditional
3668 // branch to that block if the fallthrough block is not adjacent to the
3669 // block in the new order.
3670 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3671 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3672 }
3673
3674 // It might be possible to optimize branches by flipping the condition.
3675 Cond.clear();
3676 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3677 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3678 continue;
3679 MBB.updateTerminator(FTMBB);
3680 }
3681
3682#ifndef NDEBUG
3683 // Make sure we correctly constructed all branches.
3684 F->verify(this, "After optimized block reordering");
3685#endif
3686}
3687
3688void MachineBlockPlacement::createCFGChainExtTsp() {
3689 BlockToChain.clear();
3690 ComputedEdges.clear();
3691 ChainAllocator.DestroyAll();
3692
3693 MachineBasicBlock *HeadBB = &F->front();
3694 BlockChain *FunctionChain =
3695 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3696
3697 for (MachineBasicBlock &MBB : *F) {
3698 if (HeadBB == &MBB)
3699 continue; // Ignore head of the chain
3700 FunctionChain->merge(&MBB, nullptr);
3701 }
3702}
3703
3704namespace {
3705
3706/// A pass to compute block placement statistics.
3707///
3708/// A separate pass to compute interesting statistics for evaluating block
3709/// placement. This is separate from the actual placement pass so that they can
3710/// be computed in the absence of any placement transformations or when using
3711/// alternative placement strategies.
3712class MachineBlockPlacementStats : public MachineFunctionPass {
3713 /// A handle to the branch probability pass.
3714 const MachineBranchProbabilityInfo *MBPI;
3715
3716 /// A handle to the function-wide block frequency pass.
3717 const MachineBlockFrequencyInfo *MBFI;
3718
3719public:
3720 static char ID; // Pass identification, replacement for typeid
3721
3722 MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3724 }
3725
3726 bool runOnMachineFunction(MachineFunction &F) override;
3727
3728 void getAnalysisUsage(AnalysisUsage &AU) const override {
3731 AU.setPreservesAll();
3733 }
3734};
3735
3736} // end anonymous namespace
3737
3738char MachineBlockPlacementStats::ID = 0;
3739
3740char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
3741
3742INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3743 "Basic Block Placement Stats", false, false)
3746INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3748
3749bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3750 // Check for single-block functions and skip them.
3751 if (std::next(F.begin()) == F.end())
3752 return false;
3753
3754 if (!isFunctionInPrintList(F.getName()))
3755 return false;
3756
3757 MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3758 MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
3759
3760 for (MachineBasicBlock &MBB : F) {
3761 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3762 Statistic &NumBranches =
3763 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3764 Statistic &BranchTakenFreq =
3765 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3766 for (MachineBasicBlock *Succ : MBB.successors()) {
3767 // Skip if this successor is a fallthrough.
3768 if (MBB.isLayoutSuccessor(Succ))
3769 continue;
3770
3771 BlockFrequency EdgeFreq =
3772 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3773 ++NumBranches;
3774 BranchTakenFreq += EdgeFreq.getFrequency();
3775 }
3776 }
3777
3778 return false;
3779}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
static unsigned InstrCount
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
block placement Basic Block Placement Stats
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
Branch Probability Basic Block Placement
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
if(VerifyEach)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
unify loop Fixup each natural loop to have a single exit block
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
bool erase(const KeyT &Val)
Definition: DenseMap.h:345
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:326
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:610
unsigned pred_size() const
bool isEHPad() const
Returns true if the block is a landing pad.
instr_iterator instr_begin()
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
unsigned succ_size() const
void setAlignment(Align A)
Set alignment of the basic block.
bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:387
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:437
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:408
Utility class to perform tail duplication.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
TargetInstrInfo - Interface to description of machine instruction set.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
Target-Independent Code Generator Pass Configuration Options.
bool getEnableTailMerge() const
CodeGenOptLevel getOptLevel() const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:132
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define UINT64_MAX
Definition: DataTypes.h:77
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
cl::opt< bool > ApplyExtTspWithoutProfile
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
cl::opt< unsigned > StaticLikelyProb
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void initializeMachineBlockPlacementPass(PassRegistry &)
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39