LLVM 20.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
27#include "BranchFolding.h"
28#include "llvm/ADT/ArrayRef.h"
29#include "llvm/ADT/DenseMap.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/SetVector.h"
34#include "llvm/ADT/Statistic.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/IR/Function.h"
53#include "llvm/IR/PrintPasses.h"
55#include "llvm/Pass.h"
62#include "llvm/Support/Debug.h"
66#include <algorithm>
67#include <cassert>
68#include <cstdint>
69#include <iterator>
70#include <memory>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "block-placement"
79
80STATISTIC(NumCondBranches, "Number of conditional branches");
81STATISTIC(NumUncondBranches, "Number of unconditional branches");
82STATISTIC(CondBranchTakenFreq,
83 "Potential frequency of taking conditional branches");
84STATISTIC(UncondBranchTakenFreq,
85 "Potential frequency of taking unconditional branches");
86
88 "align-all-blocks",
89 cl::desc("Force the alignment of all blocks in the function in log2 format "
90 "(e.g 4 means align on 16B boundaries)."),
92
94 "align-all-nofallthru-blocks",
95 cl::desc("Force the alignment of all blocks that have no fall-through "
96 "predecessors (i.e. don't add nops that are executed). In log2 "
97 "format (e.g 4 means align on 16B boundaries)."),
99
101 "max-bytes-for-alignment",
102 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
103 "alignment"),
104 cl::init(0), cl::Hidden);
105
106// FIXME: Find a good default for this flag and remove the flag.
108 "block-placement-exit-block-bias",
109 cl::desc("Block frequency percentage a loop exit block needs "
110 "over the original exit to be considered the new exit."),
111 cl::init(0), cl::Hidden);
112
113// Definition:
114// - Outlining: placement of a basic block outside the chain or hot path.
115
117 "loop-to-cold-block-ratio",
118 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
119 "(frequency of block) is greater than this ratio"),
120 cl::init(5), cl::Hidden);
121
122static cl::opt<bool>
123 ForceLoopColdBlock("force-loop-cold-block",
124 cl::desc("Force outlining cold blocks from loops."),
125 cl::init(false), cl::Hidden);
126
127static cl::opt<bool>
128 PreciseRotationCost("precise-rotation-cost",
129 cl::desc("Model the cost of loop rotation more "
130 "precisely by using profile data."),
131 cl::init(false), cl::Hidden);
132
133static cl::opt<bool>
134 ForcePreciseRotationCost("force-precise-rotation-cost",
135 cl::desc("Force the use of precise cost "
136 "loop rotation strategy."),
137 cl::init(false), cl::Hidden);
138
140 "misfetch-cost",
141 cl::desc("Cost that models the probabilistic risk of an instruction "
142 "misfetch due to a jump comparing to falling through, whose cost "
143 "is zero."),
144 cl::init(1), cl::Hidden);
145
146static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
147 cl::desc("Cost of jump instructions."),
148 cl::init(1), cl::Hidden);
149static cl::opt<bool>
150 TailDupPlacement("tail-dup-placement",
151 cl::desc("Perform tail duplication during placement. "
152 "Creates more fallthrough opportunites in "
153 "outline branches."),
154 cl::init(true), cl::Hidden);
155
156static cl::opt<bool>
157 BranchFoldPlacement("branch-fold-placement",
158 cl::desc("Perform branch folding during placement. "
159 "Reduces code size."),
160 cl::init(true), cl::Hidden);
161
162// Heuristic for tail duplication.
164 "tail-dup-placement-threshold",
165 cl::desc("Instruction cutoff for tail duplication during layout. "
166 "Tail merging during layout is forced to have a threshold "
167 "that won't conflict."),
168 cl::init(2), cl::Hidden);
169
170// Heuristic for aggressive tail duplication.
172 "tail-dup-placement-aggressive-threshold",
173 cl::desc("Instruction cutoff for aggressive tail duplication during "
174 "layout. Used at -O3. Tail merging during layout is forced to "
175 "have a threshold that won't conflict."),
176 cl::init(4), cl::Hidden);
177
178// Heuristic for tail duplication.
180 "tail-dup-placement-penalty",
181 cl::desc(
182 "Cost penalty for blocks that can avoid breaking CFG by copying. "
183 "Copying can increase fallthrough, but it also increases icache "
184 "pressure. This parameter controls the penalty to account for that. "
185 "Percent as integer."),
186 cl::init(2), cl::Hidden);
187
188// Heuristic for tail duplication if profile count is used in cost model.
190 "tail-dup-profile-percent-threshold",
191 cl::desc("If profile count information is used in tail duplication cost "
192 "model, the gained fall through number from tail duplication "
193 "should be at least this percent of hot count."),
194 cl::init(50), cl::Hidden);
195
196// Heuristic for triangle chains.
198 "triangle-chain-count",
199 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
200 "triangle tail duplication heuristic to kick in. 0 to disable."),
201 cl::init(2), cl::Hidden);
202
203// Use case: When block layout is visualized after MBP pass, the basic blocks
204// are labeled in layout order; meanwhile blocks could be numbered in a
205// different order. It's hard to map between the graph and pass output.
206// With this option on, the basic blocks are renumbered in function layout
207// order. For debugging only.
209 "renumber-blocks-before-view",
210 cl::desc(
211 "If true, basic blocks are re-numbered before MBP layout is printed "
212 "into a dot graph. Only used when a function is being printed."),
213 cl::init(false), cl::Hidden);
214
216 "ext-tsp-block-placement-max-blocks",
217 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
218 "block placement."),
219 cl::init(UINT_MAX), cl::Hidden);
220
221// Apply the ext-tsp algorithm minimizing the size of a binary.
222static cl::opt<bool>
223 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
224 cl::desc("Use ext-tsp for size-aware block placement."));
225
226namespace llvm {
231
232// Internal option used to control BFI display only after MBP pass.
233// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
234// -view-block-layout-with-bfi=
236
237// Command line option to specify the name of the function for CFG dump
238// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
240} // namespace llvm
241
242namespace {
243
244class BlockChain;
245
246/// Type for our function-wide basic block -> block chain mapping.
247using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
248
249/// A chain of blocks which will be laid out contiguously.
250///
251/// This is the datastructure representing a chain of consecutive blocks that
252/// are profitable to layout together in order to maximize fallthrough
253/// probabilities and code locality. We also can use a block chain to represent
254/// a sequence of basic blocks which have some external (correctness)
255/// requirement for sequential layout.
256///
257/// Chains can be built around a single basic block and can be merged to grow
258/// them. They participate in a block-to-chain mapping, which is updated
259/// automatically as chains are merged together.
260class BlockChain {
261 /// The sequence of blocks belonging to this chain.
262 ///
263 /// This is the sequence of blocks for a particular chain. These will be laid
264 /// out in-order within the function.
266
267 /// A handle to the function-wide basic block to block chain mapping.
268 ///
269 /// This is retained in each block chain to simplify the computation of child
270 /// block chains for SCC-formation and iteration. We store the edges to child
271 /// basic blocks, and map them back to their associated chains using this
272 /// structure.
273 BlockToChainMapType &BlockToChain;
274
275public:
276 /// Construct a new BlockChain.
277 ///
278 /// This builds a new block chain representing a single basic block in the
279 /// function. It also registers itself as the chain that block participates
280 /// in with the BlockToChain mapping.
281 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
282 : Blocks(1, BB), BlockToChain(BlockToChain) {
283 assert(BB && "Cannot create a chain with a null basic block");
284 BlockToChain[BB] = this;
285 }
286
287 /// Iterator over blocks within the chain.
290
291 /// Beginning of blocks within the chain.
292 iterator begin() { return Blocks.begin(); }
293 const_iterator begin() const { return Blocks.begin(); }
294
295 /// End of blocks within the chain.
296 iterator end() { return Blocks.end(); }
297 const_iterator end() const { return Blocks.end(); }
298
299 bool remove(MachineBasicBlock *BB) {
300 for (iterator i = begin(); i != end(); ++i) {
301 if (*i == BB) {
302 Blocks.erase(i);
303 return true;
304 }
305 }
306 return false;
307 }
308
309 /// Merge a block chain into this one.
310 ///
311 /// This routine merges a block chain into this one. It takes care of forming
312 /// a contiguous sequence of basic blocks, updating the edge list, and
313 /// updating the block -> chain mapping. It does not free or tear down the
314 /// old chain, but the old chain's block list is no longer valid.
315 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
316 assert(BB && "Can't merge a null block.");
317 assert(!Blocks.empty() && "Can't merge into an empty chain.");
318
319 // Fast path in case we don't have a chain already.
320 if (!Chain) {
321 assert(!BlockToChain[BB] &&
322 "Passed chain is null, but BB has entry in BlockToChain.");
323 Blocks.push_back(BB);
324 BlockToChain[BB] = this;
325 return;
326 }
327
328 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
329 assert(Chain->begin() != Chain->end());
330
331 // Update the incoming blocks to point to this chain, and add them to the
332 // chain structure.
333 for (MachineBasicBlock *ChainBB : *Chain) {
334 Blocks.push_back(ChainBB);
335 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
336 BlockToChain[ChainBB] = this;
337 }
338 }
339
340#ifndef NDEBUG
341 /// Dump the blocks in this chain.
342 LLVM_DUMP_METHOD void dump() {
343 for (MachineBasicBlock *MBB : *this)
344 MBB->dump();
345 }
346#endif // NDEBUG
347
348 /// Count of predecessors of any block within the chain which have not
349 /// yet been scheduled. In general, we will delay scheduling this chain
350 /// until those predecessors are scheduled (or we find a sufficiently good
351 /// reason to override this heuristic.) Note that when forming loop chains,
352 /// blocks outside the loop are ignored and treated as if they were already
353 /// scheduled.
354 ///
355 /// Note: This field is reinitialized multiple times - once for each loop,
356 /// and then once for the function as a whole.
357 unsigned UnscheduledPredecessors = 0;
358};
359
360class MachineBlockPlacement : public MachineFunctionPass {
361 /// A type for a block filter set.
363
364 /// Pair struct containing basic block and taildup profitability
365 struct BlockAndTailDupResult {
366 MachineBasicBlock *BB = nullptr;
367 bool ShouldTailDup;
368 };
369
370 /// Triple struct containing edge weight and the edge.
371 struct WeightedEdge {
372 BlockFrequency Weight;
373 MachineBasicBlock *Src = nullptr;
374 MachineBasicBlock *Dest = nullptr;
375 };
376
377 /// work lists of blocks that are ready to be laid out
380
381 /// Edges that have already been computed as optimal.
383
384 /// Machine Function
385 MachineFunction *F = nullptr;
386
387 /// A handle to the branch probability pass.
388 const MachineBranchProbabilityInfo *MBPI = nullptr;
389
390 /// A handle to the function-wide block frequency pass.
391 std::unique_ptr<MBFIWrapper> MBFI;
392
393 /// A handle to the loop info.
394 MachineLoopInfo *MLI = nullptr;
395
396 /// Preferred loop exit.
397 /// Member variable for convenience. It may be removed by duplication deep
398 /// in the call stack.
399 MachineBasicBlock *PreferredLoopExit = nullptr;
400
401 /// A handle to the target's instruction info.
402 const TargetInstrInfo *TII = nullptr;
403
404 /// A handle to the target's lowering info.
405 const TargetLoweringBase *TLI = nullptr;
406
407 /// A handle to the post dominator tree.
408 MachinePostDominatorTree *MPDT = nullptr;
409
410 ProfileSummaryInfo *PSI = nullptr;
411
412 TargetPassConfig *PassConfig = nullptr;
413
414 /// Duplicator used to duplicate tails during placement.
415 ///
416 /// Placement decisions can open up new tail duplication opportunities, but
417 /// since tail duplication affects placement decisions of later blocks, it
418 /// must be done inline.
419 TailDuplicator TailDup;
420
421 /// Partial tail duplication threshold.
422 BlockFrequency DupThreshold;
423
424 unsigned TailDupSize;
425
426 /// True: use block profile count to compute tail duplication cost.
427 /// False: use block frequency to compute tail duplication cost.
428 bool UseProfileCount = false;
429
430 /// Allocator and owner of BlockChain structures.
431 ///
432 /// We build BlockChains lazily while processing the loop structure of
433 /// a function. To reduce malloc traffic, we allocate them using this
434 /// slab-like allocator, and destroy them after the pass completes. An
435 /// important guarantee is that this allocator produces stable pointers to
436 /// the chains.
438
439 /// Function wide BasicBlock to BlockChain mapping.
440 ///
441 /// This mapping allows efficiently moving from any given basic block to the
442 /// BlockChain it participates in, if any. We use it to, among other things,
443 /// allow implicitly defining edges between chains as the existing edges
444 /// between basic blocks.
446
447#ifndef NDEBUG
448 /// The set of basic blocks that have terminators that cannot be fully
449 /// analyzed. These basic blocks cannot be re-ordered safely by
450 /// MachineBlockPlacement, and we must preserve physical layout of these
451 /// blocks and their successors through the pass.
452 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
453#endif
454
455 /// Get block profile count or frequency according to UseProfileCount.
456 /// The return value is used to model tail duplication cost.
457 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
458 if (UseProfileCount) {
459 auto Count = MBFI->getBlockProfileCount(BB);
460 if (Count)
461 return BlockFrequency(*Count);
462 else
463 return BlockFrequency(0);
464 } else
465 return MBFI->getBlockFreq(BB);
466 }
467
468 /// Scale the DupThreshold according to basic block size.
469 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
470 void initTailDupThreshold();
471
472 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
473 /// if the count goes to 0, add them to the appropriate work list.
474 void markChainSuccessors(const BlockChain &Chain,
475 const MachineBasicBlock *LoopHeaderBB,
476 const BlockFilterSet *BlockFilter = nullptr);
477
478 /// Decrease the UnscheduledPredecessors count for a single block, and
479 /// if the count goes to 0, add them to the appropriate work list.
480 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
481 const MachineBasicBlock *LoopHeaderBB,
482 const BlockFilterSet *BlockFilter = nullptr);
483
485 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
486 const BlockFilterSet *BlockFilter,
488 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
489 BlockFilterSet *BlockFilter);
490 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
492 BlockFilterSet *BlockFilter);
493 bool repeatedlyTailDuplicateBlock(
495 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
496 BlockFilterSet *BlockFilter,
497 MachineFunction::iterator &PrevUnplacedBlockIt,
498 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
499 bool
500 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
501 BlockChain &Chain, BlockFilterSet *BlockFilter,
502 MachineFunction::iterator &PrevUnplacedBlockIt,
503 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
504 bool &DuplicatedToLPred);
505 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
506 const MachineBasicBlock *Succ,
507 const BlockChain &SuccChain,
508 BranchProbability SuccProb,
509 BranchProbability RealSuccProb,
510 const BlockChain &Chain,
511 const BlockFilterSet *BlockFilter);
512 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
513 const BlockChain &Chain,
514 const BlockFilterSet *BlockFilter);
516 selectBestCandidateBlock(const BlockChain &Chain,
519 getFirstUnplacedBlock(const BlockChain &PlacedChain,
520 MachineFunction::iterator &PrevUnplacedBlockIt);
522 getFirstUnplacedBlock(const BlockChain &PlacedChain,
523 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
524 const BlockFilterSet *BlockFilter);
525
526 /// Add a basic block to the work list if it is appropriate.
527 ///
528 /// If the optional parameter BlockFilter is provided, only MBB
529 /// present in the set will be added to the worklist. If nullptr
530 /// is provided, no filtering occurs.
531 void fillWorkLists(const MachineBasicBlock *MBB,
532 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
533 const BlockFilterSet *BlockFilter);
534
535 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
536 BlockFilterSet *BlockFilter = nullptr);
537 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
538 const MachineBasicBlock *OldTop);
539 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
540 const BlockFilterSet &LoopBlockSet);
541 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
542 const BlockFilterSet &LoopBlockSet);
543 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
544 const MachineBasicBlock *OldTop,
545 const MachineBasicBlock *ExitBB,
546 const BlockFilterSet &LoopBlockSet);
547 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
548 const MachineLoop &L,
549 const BlockFilterSet &LoopBlockSet);
550 MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
551 const BlockFilterSet &LoopBlockSet);
552 MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
553 const BlockFilterSet &LoopBlockSet,
554 BlockFrequency &ExitFreq);
555 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
556 void buildLoopChains(const MachineLoop &L);
557 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
558 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
559 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
560 const BlockFilterSet &LoopBlockSet);
561 void buildCFGChains();
562 void optimizeBranches();
563 void alignBlocks();
564 /// Returns true if a block should be tail-duplicated to increase fallthrough
565 /// opportunities.
566 bool shouldTailDuplicate(MachineBasicBlock *BB);
567 /// Check the edge frequencies to see if tail duplication will increase
568 /// fallthroughs.
569 bool isProfitableToTailDup(const MachineBasicBlock *BB,
570 const MachineBasicBlock *Succ,
571 BranchProbability QProb, const BlockChain &Chain,
572 const BlockFilterSet *BlockFilter);
573
574 /// Check for a trellis layout.
575 bool isTrellis(const MachineBasicBlock *BB,
576 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
577 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
578
579 /// Get the best successor given a trellis layout.
580 BlockAndTailDupResult getBestTrellisSuccessor(
581 const MachineBasicBlock *BB,
582 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
583 BranchProbability AdjustedSumProb, const BlockChain &Chain,
584 const BlockFilterSet *BlockFilter);
585
586 /// Get the best pair of non-conflicting edges.
587 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
588 const MachineBasicBlock *BB,
590
591 /// Returns true if a block can tail duplicate into all unplaced
592 /// predecessors. Filters based on loop.
593 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
594 MachineBasicBlock *Succ,
595 const BlockChain &Chain,
596 const BlockFilterSet *BlockFilter);
597
598 /// Find chains of triangles to tail-duplicate where a global analysis works,
599 /// but a local analysis would not find them.
600 void precomputeTriangleChains();
601
602 /// Apply a post-processing step optimizing block placement.
603 void applyExtTsp(bool OptForSize);
604
605 /// Modify the existing block placement in the function and adjust all jumps.
606 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
607
608 /// Create a single CFG chain from the current block order.
609 void createCFGChainExtTsp();
610
611public:
612 static char ID; // Pass identification, replacement for typeid
613
614 MachineBlockPlacement() : MachineFunctionPass(ID) {
616 }
617
618 bool runOnMachineFunction(MachineFunction &F) override;
619
620 bool allowTailDupPlacement() const {
621 assert(F);
622 return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
623 }
624
625 void getAnalysisUsage(AnalysisUsage &AU) const override {
634 }
635};
636
637} // end anonymous namespace
638
639char MachineBlockPlacement::ID = 0;
640
641char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
642
643INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
644 "Branch Probability Basic Block Placement", false, false)
650INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
651 "Branch Probability Basic Block Placement", false, false)
652
653#ifndef NDEBUG
654/// Helper to print the name of a MBB.
655///
656/// Only used by debug logging.
657static std::string getBlockName(const MachineBasicBlock *BB) {
658 std::string Result;
659 raw_string_ostream OS(Result);
660 OS << printMBBReference(*BB);
661 OS << " ('" << BB->getName() << "')";
662 OS.flush();
663 return Result;
664}
665#endif
666
667/// Mark a chain's successors as having one fewer preds.
668///
669/// When a chain is being merged into the "placed" chain, this routine will
670/// quickly walk the successors of each block in the chain and mark them as
671/// having one fewer active predecessor. It also adds any successors of this
672/// chain which reach the zero-predecessor state to the appropriate worklist.
673void MachineBlockPlacement::markChainSuccessors(
674 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
675 const BlockFilterSet *BlockFilter) {
676 // Walk all the blocks in this chain, marking their successors as having
677 // a predecessor placed.
678 for (MachineBasicBlock *MBB : Chain) {
679 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
680 }
681}
682
683/// Mark a single block's successors as having one fewer preds.
684///
685/// Under normal circumstances, this is only called by markChainSuccessors,
686/// but if a block that was to be placed is completely tail-duplicated away,
687/// and was duplicated into the chain end, we need to redo markBlockSuccessors
688/// for just that block.
689void MachineBlockPlacement::markBlockSuccessors(
690 const BlockChain &Chain, const MachineBasicBlock *MBB,
691 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
692 // Add any successors for which this is the only un-placed in-loop
693 // predecessor to the worklist as a viable candidate for CFG-neutral
694 // placement. No subsequent placement of this block will violate the CFG
695 // shape, so we get to use heuristics to choose a favorable placement.
696 for (MachineBasicBlock *Succ : MBB->successors()) {
697 if (BlockFilter && !BlockFilter->count(Succ))
698 continue;
699 BlockChain &SuccChain = *BlockToChain[Succ];
700 // Disregard edges within a fixed chain, or edges to the loop header.
701 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
702 continue;
703
704 // This is a cross-chain edge that is within the loop, so decrement the
705 // loop predecessor count of the destination chain.
706 if (SuccChain.UnscheduledPredecessors == 0 ||
707 --SuccChain.UnscheduledPredecessors > 0)
708 continue;
709
710 auto *NewBB = *SuccChain.begin();
711 if (NewBB->isEHPad())
712 EHPadWorkList.push_back(NewBB);
713 else
714 BlockWorkList.push_back(NewBB);
715 }
716}
717
718/// This helper function collects the set of successors of block
719/// \p BB that are allowed to be its layout successors, and return
720/// the total branch probability of edges from \p BB to those
721/// blocks.
722BranchProbability MachineBlockPlacement::collectViableSuccessors(
723 const MachineBasicBlock *BB, const BlockChain &Chain,
724 const BlockFilterSet *BlockFilter,
726 // Adjust edge probabilities by excluding edges pointing to blocks that is
727 // either not in BlockFilter or is already in the current chain. Consider the
728 // following CFG:
729 //
730 // --->A
731 // | / \
732 // | B C
733 // | \ / \
734 // ----D E
735 //
736 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
737 // A->C is chosen as a fall-through, D won't be selected as a successor of C
738 // due to CFG constraint (the probability of C->D is not greater than
739 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
740 // when calculating the probability of C->D, D will be selected and we
741 // will get A C D B as the layout of this loop.
742 auto AdjustedSumProb = BranchProbability::getOne();
743 for (MachineBasicBlock *Succ : BB->successors()) {
744 bool SkipSucc = false;
745 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
746 SkipSucc = true;
747 } else {
748 BlockChain *SuccChain = BlockToChain[Succ];
749 if (SuccChain == &Chain) {
750 SkipSucc = true;
751 } else if (Succ != *SuccChain->begin()) {
752 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
753 << " -> Mid chain!\n");
754 continue;
755 }
756 }
757 if (SkipSucc)
758 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
759 else
760 Successors.push_back(Succ);
761 }
762
763 return AdjustedSumProb;
764}
765
766/// The helper function returns the branch probability that is adjusted
767/// or normalized over the new total \p AdjustedSumProb.
770 BranchProbability AdjustedSumProb) {
771 BranchProbability SuccProb;
772 uint32_t SuccProbN = OrigProb.getNumerator();
773 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
774 if (SuccProbN >= SuccProbD)
775 SuccProb = BranchProbability::getOne();
776 else
777 SuccProb = BranchProbability(SuccProbN, SuccProbD);
778
779 return SuccProb;
780}
781
782/// Check if \p BB has exactly the successors in \p Successors.
783static bool
786 if (BB.succ_size() != Successors.size())
787 return false;
788 // We don't want to count self-loops
789 if (Successors.count(&BB))
790 return false;
791 for (MachineBasicBlock *Succ : BB.successors())
792 if (!Successors.count(Succ))
793 return false;
794 return true;
795}
796
797/// Check if a block should be tail duplicated to increase fallthrough
798/// opportunities.
799/// \p BB Block to check.
800bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
801 // Blocks with single successors don't create additional fallthrough
802 // opportunities. Don't duplicate them. TODO: When conditional exits are
803 // analyzable, allow them to be duplicated.
804 bool IsSimple = TailDup.isSimpleBB(BB);
805
806 if (BB->succ_size() == 1)
807 return false;
808 return TailDup.shouldTailDuplicate(IsSimple, *BB);
809}
810
811/// Compare 2 BlockFrequency's with a small penalty for \p A.
812/// In order to be conservative, we apply a X% penalty to account for
813/// increased icache pressure and static heuristics. For small frequencies
814/// we use only the numerators to improve accuracy. For simplicity, we assume
815/// the penalty is less than 100%
816/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
818 BlockFrequency EntryFreq) {
819 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
820 BlockFrequency Gain = A - B;
821 return (Gain / ThresholdProb) >= EntryFreq;
822}
823
824/// Check the edge frequencies to see if tail duplication will increase
825/// fallthroughs. It only makes sense to call this function when
826/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
827/// always locally profitable if we would have picked \p Succ without
828/// considering duplication.
829bool MachineBlockPlacement::isProfitableToTailDup(
830 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
831 BranchProbability QProb, const BlockChain &Chain,
832 const BlockFilterSet *BlockFilter) {
833 // We need to do a probability calculation to make sure this is profitable.
834 // First: does succ have a successor that post-dominates? This affects the
835 // calculation. The 2 relevant cases are:
836 // BB BB
837 // | \Qout | \Qout
838 // P| C |P C
839 // = C' = C'
840 // | /Qin | /Qin
841 // | / | /
842 // Succ Succ
843 // / \ | \ V
844 // U/ =V |U \
845 // / \ = D
846 // D E | /
847 // | /
848 // |/
849 // PDom
850 // '=' : Branch taken for that CFG edge
851 // In the second case, Placing Succ while duplicating it into C prevents the
852 // fallthrough of Succ into either D or PDom, because they now have C as an
853 // unplaced predecessor
854
855 // Start by figuring out which case we fall into
856 MachineBasicBlock *PDom = nullptr;
858 // Only scan the relevant successors
859 auto AdjustedSuccSumProb =
860 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
861 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
862 auto BBFreq = MBFI->getBlockFreq(BB);
863 auto SuccFreq = MBFI->getBlockFreq(Succ);
864 BlockFrequency P = BBFreq * PProb;
865 BlockFrequency Qout = BBFreq * QProb;
866 BlockFrequency EntryFreq = MBFI->getEntryFreq();
867 // If there are no more successors, it is profitable to copy, as it strictly
868 // increases fallthrough.
869 if (SuccSuccs.size() == 0)
870 return greaterWithBias(P, Qout, EntryFreq);
871
872 auto BestSuccSucc = BranchProbability::getZero();
873 // Find the PDom or the best Succ if no PDom exists.
874 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
875 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
876 if (Prob > BestSuccSucc)
877 BestSuccSucc = Prob;
878 if (PDom == nullptr)
879 if (MPDT->dominates(SuccSucc, Succ)) {
880 PDom = SuccSucc;
881 break;
882 }
883 }
884 // For the comparisons, we need to know Succ's best incoming edge that isn't
885 // from BB.
886 auto SuccBestPred = BlockFrequency(0);
887 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
888 if (SuccPred == Succ || SuccPred == BB ||
889 BlockToChain[SuccPred] == &Chain ||
890 (BlockFilter && !BlockFilter->count(SuccPred)))
891 continue;
892 auto Freq =
893 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
894 if (Freq > SuccBestPred)
895 SuccBestPred = Freq;
896 }
897 // Qin is Succ's best unplaced incoming edge that isn't BB
898 BlockFrequency Qin = SuccBestPred;
899 // If it doesn't have a post-dominating successor, here is the calculation:
900 // BB BB
901 // | \Qout | \
902 // P| C | =
903 // = C' | C
904 // | /Qin | |
905 // | / | C' (+Succ)
906 // Succ Succ /|
907 // / \ | \/ |
908 // U/ =V | == |
909 // / \ | / \|
910 // D E D E
911 // '=' : Branch taken for that CFG edge
912 // Cost in the first case is: P + V
913 // For this calculation, we always assume P > Qout. If Qout > P
914 // The result of this function will be ignored at the caller.
915 // Let F = SuccFreq - Qin
916 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
917
918 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
919 BranchProbability UProb = BestSuccSucc;
920 BranchProbability VProb = AdjustedSuccSumProb - UProb;
921 BlockFrequency F = SuccFreq - Qin;
922 BlockFrequency V = SuccFreq * VProb;
923 BlockFrequency QinU = std::min(Qin, F) * UProb;
924 BlockFrequency BaseCost = P + V;
925 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
926 return greaterWithBias(BaseCost, DupCost, EntryFreq);
927 }
928 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
929 BranchProbability VProb = AdjustedSuccSumProb - UProb;
930 BlockFrequency U = SuccFreq * UProb;
931 BlockFrequency V = SuccFreq * VProb;
932 BlockFrequency F = SuccFreq - Qin;
933 // If there is a post-dominating successor, here is the calculation:
934 // BB BB BB BB
935 // | \Qout | \ | \Qout | \
936 // |P C | = |P C | =
937 // = C' |P C = C' |P C
938 // | /Qin | | | /Qin | |
939 // | / | C' (+Succ) | / | C' (+Succ)
940 // Succ Succ /| Succ Succ /|
941 // | \ V | \/ | | \ V | \/ |
942 // |U \ |U /\ =? |U = |U /\ |
943 // = D = = =?| | D | = =|
944 // | / |/ D | / |/ D
945 // | / | / | = | /
946 // |/ | / |/ | =
947 // Dom Dom Dom Dom
948 // '=' : Branch taken for that CFG edge
949 // The cost for taken branches in the first case is P + U
950 // Let F = SuccFreq - Qin
951 // The cost in the second case (assuming independence), given the layout:
952 // BB, Succ, (C+Succ), D, Dom or the layout:
953 // BB, Succ, D, Dom, (C+Succ)
954 // is Qout + max(F, Qin) * U + min(F, Qin)
955 // compare P + U vs Qout + P * U + Qin.
956 //
957 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
958 //
959 // For the 3rd case, the cost is P + 2 * V
960 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
961 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
962 if (UProb > AdjustedSuccSumProb / 2 &&
963 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
964 Chain, BlockFilter))
965 // Cases 3 & 4
966 return greaterWithBias(
967 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
968 EntryFreq);
969 // Cases 1 & 2
970 return greaterWithBias((P + U),
971 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
972 std::max(Qin, F) * UProb),
973 EntryFreq);
974}
975
976/// Check for a trellis layout. \p BB is the upper part of a trellis if its
977/// successors form the lower part of a trellis. A successor set S forms the
978/// lower part of a trellis if all of the predecessors of S are either in S or
979/// have all of S as successors. We ignore trellises where BB doesn't have 2
980/// successors because for fewer than 2, it's trivial, and for 3 or greater they
981/// are very uncommon and complex to compute optimally. Allowing edges within S
982/// is not strictly a trellis, but the same algorithm works, so we allow it.
983bool MachineBlockPlacement::isTrellis(
984 const MachineBasicBlock *BB,
985 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
986 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
987 // Technically BB could form a trellis with branching factor higher than 2.
988 // But that's extremely uncommon.
989 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
990 return false;
991
993 BB->succ_end());
994 // To avoid reviewing the same predecessors twice.
996
997 for (MachineBasicBlock *Succ : ViableSuccs) {
998 int PredCount = 0;
999 for (auto *SuccPred : Succ->predecessors()) {
1000 // Allow triangle successors, but don't count them.
1001 if (Successors.count(SuccPred)) {
1002 // Make sure that it is actually a triangle.
1003 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1004 if (!Successors.count(CheckSucc))
1005 return false;
1006 continue;
1007 }
1008 const BlockChain *PredChain = BlockToChain[SuccPred];
1009 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1010 PredChain == &Chain || PredChain == BlockToChain[Succ])
1011 continue;
1012 ++PredCount;
1013 // Perform the successor check only once.
1014 if (!SeenPreds.insert(SuccPred).second)
1015 continue;
1016 if (!hasSameSuccessors(*SuccPred, Successors))
1017 return false;
1018 }
1019 // If one of the successors has only BB as a predecessor, it is not a
1020 // trellis.
1021 if (PredCount < 1)
1022 return false;
1023 }
1024 return true;
1025}
1026
1027/// Pick the highest total weight pair of edges that can both be laid out.
1028/// The edges in \p Edges[0] are assumed to have a different destination than
1029/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1030/// the individual highest weight edges to the 2 different destinations, or in
1031/// case of a conflict, one of them should be replaced with a 2nd best edge.
1032std::pair<MachineBlockPlacement::WeightedEdge,
1033 MachineBlockPlacement::WeightedEdge>
1034MachineBlockPlacement::getBestNonConflictingEdges(
1035 const MachineBasicBlock *BB,
1037 Edges) {
1038 // Sort the edges, and then for each successor, find the best incoming
1039 // predecessor. If the best incoming predecessors aren't the same,
1040 // then that is clearly the best layout. If there is a conflict, one of the
1041 // successors will have to fallthrough from the second best predecessor. We
1042 // compare which combination is better overall.
1043
1044 // Sort for highest frequency.
1045 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1046
1047 llvm::stable_sort(Edges[0], Cmp);
1048 llvm::stable_sort(Edges[1], Cmp);
1049 auto BestA = Edges[0].begin();
1050 auto BestB = Edges[1].begin();
1051 // Arrange for the correct answer to be in BestA and BestB
1052 // If the 2 best edges don't conflict, the answer is already there.
1053 if (BestA->Src == BestB->Src) {
1054 // Compare the total fallthrough of (Best + Second Best) for both pairs
1055 auto SecondBestA = std::next(BestA);
1056 auto SecondBestB = std::next(BestB);
1057 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1058 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1059 if (BestAScore < BestBScore)
1060 BestA = SecondBestA;
1061 else
1062 BestB = SecondBestB;
1063 }
1064 // Arrange for the BB edge to be in BestA if it exists.
1065 if (BestB->Src == BB)
1066 std::swap(BestA, BestB);
1067 return std::make_pair(*BestA, *BestB);
1068}
1069
1070/// Get the best successor from \p BB based on \p BB being part of a trellis.
1071/// We only handle trellises with 2 successors, so the algorithm is
1072/// straightforward: Find the best pair of edges that don't conflict. We find
1073/// the best incoming edge for each successor in the trellis. If those conflict,
1074/// we consider which of them should be replaced with the second best.
1075/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1076/// comes from \p BB, it will be in \p BestEdges[0]
1077MachineBlockPlacement::BlockAndTailDupResult
1078MachineBlockPlacement::getBestTrellisSuccessor(
1079 const MachineBasicBlock *BB,
1080 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1081 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1082 const BlockFilterSet *BlockFilter) {
1083
1084 BlockAndTailDupResult Result = {nullptr, false};
1086 BB->succ_end());
1087
1088 // We assume size 2 because it's common. For general n, we would have to do
1089 // the Hungarian algorithm, but it's not worth the complexity because more
1090 // than 2 successors is fairly uncommon, and a trellis even more so.
1091 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1092 return Result;
1093
1094 // Collect the edge frequencies of all edges that form the trellis.
1096 int SuccIndex = 0;
1097 for (auto *Succ : ViableSuccs) {
1098 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1099 // Skip any placed predecessors that are not BB
1100 if (SuccPred != BB)
1101 if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1102 BlockToChain[SuccPred] == &Chain ||
1103 BlockToChain[SuccPred] == BlockToChain[Succ])
1104 continue;
1105 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1106 MBPI->getEdgeProbability(SuccPred, Succ);
1107 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1108 }
1109 ++SuccIndex;
1110 }
1111
1112 // Pick the best combination of 2 edges from all the edges in the trellis.
1113 WeightedEdge BestA, BestB;
1114 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1115
1116 if (BestA.Src != BB) {
1117 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1118 // we shouldn't choose any successor. We've already looked and there's a
1119 // better fallthrough edge for all the successors.
1120 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1121 return Result;
1122 }
1123
1124 // Did we pick the triangle edge? If tail-duplication is profitable, do
1125 // that instead. Otherwise merge the triangle edge now while we know it is
1126 // optimal.
1127 if (BestA.Dest == BestB.Src) {
1128 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1129 // would be better.
1130 MachineBasicBlock *Succ1 = BestA.Dest;
1131 MachineBasicBlock *Succ2 = BestB.Dest;
1132 // Check to see if tail-duplication would be profitable.
1133 if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1134 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1135 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1136 Chain, BlockFilter)) {
1138 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1139 dbgs() << " Selected: " << getBlockName(Succ2)
1140 << ", probability: " << Succ2Prob
1141 << " (Tail Duplicate)\n");
1142 Result.BB = Succ2;
1143 Result.ShouldTailDup = true;
1144 return Result;
1145 }
1146 }
1147 // We have already computed the optimal edge for the other side of the
1148 // trellis.
1149 ComputedEdges[BestB.Src] = {BestB.Dest, false};
1150
1151 auto TrellisSucc = BestA.Dest;
1153 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1154 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1155 << ", probability: " << SuccProb << " (Trellis)\n");
1156 Result.BB = TrellisSucc;
1157 return Result;
1158}
1159
1160/// When the option allowTailDupPlacement() is on, this method checks if the
1161/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1162/// into all of its unplaced, unfiltered predecessors, that are not BB.
1163bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1164 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1165 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1166 if (!shouldTailDuplicate(Succ))
1167 return false;
1168
1169 // The result of canTailDuplicate.
1170 bool Duplicate = true;
1171 // Number of possible duplication.
1172 unsigned int NumDup = 0;
1173
1174 // For CFG checking.
1176 BB->succ_end());
1177 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1178 // Make sure all unplaced and unfiltered predecessors can be
1179 // tail-duplicated into.
1180 // Skip any blocks that are already placed or not in this loop.
1181 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
1182 (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1183 continue;
1184 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1185 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1186 // This will result in a trellis after tail duplication, so we don't
1187 // need to copy Succ into this predecessor. In the presence
1188 // of a trellis tail duplication can continue to be profitable.
1189 // For example:
1190 // A A
1191 // |\ |\
1192 // | \ | \
1193 // | C | C+BB
1194 // | / | |
1195 // |/ | |
1196 // BB => BB |
1197 // |\ |\/|
1198 // | \ |/\|
1199 // | D | D
1200 // | / | /
1201 // |/ |/
1202 // Succ Succ
1203 //
1204 // After BB was duplicated into C, the layout looks like the one on the
1205 // right. BB and C now have the same successors. When considering
1206 // whether Succ can be duplicated into all its unplaced predecessors, we
1207 // ignore C.
1208 // We can do this because C already has a profitable fallthrough, namely
1209 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1210 // duplication and for this test.
1211 //
1212 // This allows trellises to be laid out in 2 separate chains
1213 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1214 // because it allows the creation of 2 fallthrough paths with links
1215 // between them, and we correctly identify the best layout for these
1216 // CFGs. We want to extend trellises that the user created in addition
1217 // to trellises created by tail-duplication, so we just look for the
1218 // CFG.
1219 continue;
1220 Duplicate = false;
1221 continue;
1222 }
1223 NumDup++;
1224 }
1225
1226 // No possible duplication in current filter set.
1227 if (NumDup == 0)
1228 return false;
1229
1230 // If profile information is available, findDuplicateCandidates can do more
1231 // precise benefit analysis.
1232 if (F->getFunction().hasProfileData())
1233 return true;
1234
1235 // This is mainly for function exit BB.
1236 // The integrated tail duplication is really designed for increasing
1237 // fallthrough from predecessors from Succ to its successors. We may need
1238 // other machanism to handle different cases.
1239 if (Succ->succ_empty())
1240 return true;
1241
1242 // Plus the already placed predecessor.
1243 NumDup++;
1244
1245 // If the duplication candidate has more unplaced predecessors than
1246 // successors, the extra duplication can't bring more fallthrough.
1247 //
1248 // Pred1 Pred2 Pred3
1249 // \ | /
1250 // \ | /
1251 // \ | /
1252 // Dup
1253 // / \
1254 // / \
1255 // Succ1 Succ2
1256 //
1257 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1258 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1259 // but the duplication into Pred3 can't increase fallthrough.
1260 //
1261 // A small number of extra duplication may not hurt too much. We need a better
1262 // heuristic to handle it.
1263 if ((NumDup > Succ->succ_size()) || !Duplicate)
1264 return false;
1265
1266 return true;
1267}
1268
1269/// Find chains of triangles where we believe it would be profitable to
1270/// tail-duplicate them all, but a local analysis would not find them.
1271/// There are 3 ways this can be profitable:
1272/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1273/// longer chains)
1274/// 2) The chains are statically correlated. Branch probabilities have a very
1275/// U-shaped distribution.
1276/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1277/// If the branches in a chain are likely to be from the same side of the
1278/// distribution as their predecessor, but are independent at runtime, this
1279/// transformation is profitable. (Because the cost of being wrong is a small
1280/// fixed cost, unlike the standard triangle layout where the cost of being
1281/// wrong scales with the # of triangles.)
1282/// 3) The chains are dynamically correlated. If the probability that a previous
1283/// branch was taken positively influences whether the next branch will be
1284/// taken
1285/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1286void MachineBlockPlacement::precomputeTriangleChains() {
1287 struct TriangleChain {
1288 std::vector<MachineBasicBlock *> Edges;
1289
1290 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1291 : Edges({src, dst}) {}
1292
1293 void append(MachineBasicBlock *dst) {
1294 assert(getKey()->isSuccessor(dst) &&
1295 "Attempting to append a block that is not a successor.");
1296 Edges.push_back(dst);
1297 }
1298
1299 unsigned count() const { return Edges.size() - 1; }
1300
1301 MachineBasicBlock *getKey() const { return Edges.back(); }
1302 };
1303
1304 if (TriangleChainCount == 0)
1305 return;
1306
1307 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1308 // Map from last block to the chain that contains it. This allows us to extend
1309 // chains as we find new triangles.
1311 for (MachineBasicBlock &BB : *F) {
1312 // If BB doesn't have 2 successors, it doesn't start a triangle.
1313 if (BB.succ_size() != 2)
1314 continue;
1315 MachineBasicBlock *PDom = nullptr;
1316 for (MachineBasicBlock *Succ : BB.successors()) {
1317 if (!MPDT->dominates(Succ, &BB))
1318 continue;
1319 PDom = Succ;
1320 break;
1321 }
1322 // If BB doesn't have a post-dominating successor, it doesn't form a
1323 // triangle.
1324 if (PDom == nullptr)
1325 continue;
1326 // If PDom has a hint that it is low probability, skip this triangle.
1327 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1328 continue;
1329 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1330 // we're looking for.
1331 if (!shouldTailDuplicate(PDom))
1332 continue;
1333 bool CanTailDuplicate = true;
1334 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1335 // isn't the kind of triangle we're looking for.
1336 for (MachineBasicBlock *Pred : PDom->predecessors()) {
1337 if (Pred == &BB)
1338 continue;
1339 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1340 CanTailDuplicate = false;
1341 break;
1342 }
1343 }
1344 // If we can't tail-duplicate PDom to its predecessors, then skip this
1345 // triangle.
1346 if (!CanTailDuplicate)
1347 continue;
1348
1349 // Now we have an interesting triangle. Insert it if it's not part of an
1350 // existing chain.
1351 // Note: This cannot be replaced with a call insert() or emplace() because
1352 // the find key is BB, but the insert/emplace key is PDom.
1353 auto Found = TriangleChainMap.find(&BB);
1354 // If it is, remove the chain from the map, grow it, and put it back in the
1355 // map with the end as the new key.
1356 if (Found != TriangleChainMap.end()) {
1357 TriangleChain Chain = std::move(Found->second);
1358 TriangleChainMap.erase(Found);
1359 Chain.append(PDom);
1360 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1361 } else {
1362 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1363 assert(InsertResult.second && "Block seen twice.");
1364 (void)InsertResult;
1365 }
1366 }
1367
1368 // Iterating over a DenseMap is safe here, because the only thing in the body
1369 // of the loop is inserting into another DenseMap (ComputedEdges).
1370 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1371 for (auto &ChainPair : TriangleChainMap) {
1372 TriangleChain &Chain = ChainPair.second;
1373 // Benchmarking has shown that due to branch correlation duplicating 2 or
1374 // more triangles is profitable, despite the calculations assuming
1375 // independence.
1376 if (Chain.count() < TriangleChainCount)
1377 continue;
1378 MachineBasicBlock *dst = Chain.Edges.back();
1379 Chain.Edges.pop_back();
1380 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1381 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1382 << getBlockName(dst)
1383 << " as pre-computed based on triangles.\n");
1384
1385 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1386 assert(InsertResult.second && "Block seen twice.");
1387 (void)InsertResult;
1388
1389 dst = src;
1390 }
1391 }
1392}
1393
1394// When profile is not present, return the StaticLikelyProb.
1395// When profile is available, we need to handle the triangle-shape CFG.
1396static BranchProbability
1398 if (!BB->getParent()->getFunction().hasProfileData())
1400 if (BB->succ_size() == 2) {
1401 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1402 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1403 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1404 /* See case 1 below for the cost analysis. For BB->Succ to
1405 * be taken with smaller cost, the following needs to hold:
1406 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1407 * So the threshold T in the calculation below
1408 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1409 * So T / (1 - T) = 2, Yielding T = 2/3
1410 * Also adding user specified branch bias, we have
1411 * T = (2/3)*(ProfileLikelyProb/50)
1412 * = (2*ProfileLikelyProb)/150)
1413 */
1414 return BranchProbability(2 * ProfileLikelyProb, 150);
1415 }
1416 }
1418}
1419
1420/// Checks to see if the layout candidate block \p Succ has a better layout
1421/// predecessor than \c BB. If yes, returns true.
1422/// \p SuccProb: The probability adjusted for only remaining blocks.
1423/// Only used for logging
1424/// \p RealSuccProb: The un-adjusted probability.
1425/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1426/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1427/// considered
1428bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1429 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1430 const BlockChain &SuccChain, BranchProbability SuccProb,
1431 BranchProbability RealSuccProb, const BlockChain &Chain,
1432 const BlockFilterSet *BlockFilter) {
1433
1434 // There isn't a better layout when there are no unscheduled predecessors.
1435 if (SuccChain.UnscheduledPredecessors == 0)
1436 return false;
1437
1438 // There are two basic scenarios here:
1439 // -------------------------------------
1440 // Case 1: triangular shape CFG (if-then):
1441 // BB
1442 // | \
1443 // | \
1444 // | Pred
1445 // | /
1446 // Succ
1447 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1448 // set Succ as the layout successor of BB. Picking Succ as BB's
1449 // successor breaks the CFG constraints (FIXME: define these constraints).
1450 // With this layout, Pred BB
1451 // is forced to be outlined, so the overall cost will be cost of the
1452 // branch taken from BB to Pred, plus the cost of back taken branch
1453 // from Pred to Succ, as well as the additional cost associated
1454 // with the needed unconditional jump instruction from Pred To Succ.
1455
1456 // The cost of the topological order layout is the taken branch cost
1457 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1458 // must hold:
1459 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1460 // < freq(BB->Succ) * taken_branch_cost.
1461 // Ignoring unconditional jump cost, we get
1462 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1463 // prob(BB->Succ) > 2 * prob(BB->Pred)
1464 //
1465 // When real profile data is available, we can precisely compute the
1466 // probability threshold that is needed for edge BB->Succ to be considered.
1467 // Without profile data, the heuristic requires the branch bias to be
1468 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1469 // -----------------------------------------------------------------
1470 // Case 2: diamond like CFG (if-then-else):
1471 // S
1472 // / \
1473 // | \
1474 // BB Pred
1475 // \ /
1476 // Succ
1477 // ..
1478 //
1479 // The current block is BB and edge BB->Succ is now being evaluated.
1480 // Note that edge S->BB was previously already selected because
1481 // prob(S->BB) > prob(S->Pred).
1482 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1483 // choose Pred, we will have a topological ordering as shown on the left
1484 // in the picture below. If we choose Succ, we have the solution as shown
1485 // on the right:
1486 //
1487 // topo-order:
1488 //
1489 // S----- ---S
1490 // | | | |
1491 // ---BB | | BB
1492 // | | | |
1493 // | Pred-- | Succ--
1494 // | | | |
1495 // ---Succ ---Pred--
1496 //
1497 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1498 // = freq(S->Pred) + freq(S->BB)
1499 //
1500 // If we have profile data (i.e, branch probabilities can be trusted), the
1501 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1502 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1503 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1504 // means the cost of topological order is greater.
1505 // When profile data is not available, however, we need to be more
1506 // conservative. If the branch prediction is wrong, breaking the topo-order
1507 // will actually yield a layout with large cost. For this reason, we need
1508 // strong biased branch at block S with Prob(S->BB) in order to select
1509 // BB->Succ. This is equivalent to looking the CFG backward with backward
1510 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1511 // profile data).
1512 // --------------------------------------------------------------------------
1513 // Case 3: forked diamond
1514 // S
1515 // / \
1516 // / \
1517 // BB Pred
1518 // | \ / |
1519 // | \ / |
1520 // | X |
1521 // | / \ |
1522 // | / \ |
1523 // S1 S2
1524 //
1525 // The current block is BB and edge BB->S1 is now being evaluated.
1526 // As above S->BB was already selected because
1527 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1528 //
1529 // topo-order:
1530 //
1531 // S-------| ---S
1532 // | | | |
1533 // ---BB | | BB
1534 // | | | |
1535 // | Pred----| | S1----
1536 // | | | |
1537 // --(S1 or S2) ---Pred--
1538 // |
1539 // S2
1540 //
1541 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1542 // + min(freq(Pred->S1), freq(Pred->S2))
1543 // Non-topo-order cost:
1544 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1545 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1546 // is 0. Then the non topo layout is better when
1547 // freq(S->Pred) < freq(BB->S1).
1548 // This is exactly what is checked below.
1549 // Note there are other shapes that apply (Pred may not be a single block,
1550 // but they all fit this general pattern.)
1552
1553 // Make sure that a hot successor doesn't have a globally more
1554 // important predecessor.
1555 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1556 bool BadCFGConflict = false;
1557
1558 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1559 BlockChain *PredChain = BlockToChain[Pred];
1560 if (Pred == Succ || PredChain == &SuccChain ||
1561 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
1562 Pred != *std::prev(PredChain->end()) ||
1563 // This check is redundant except for look ahead. This function is
1564 // called for lookahead by isProfitableToTailDup when BB hasn't been
1565 // placed yet.
1566 (Pred == BB))
1567 continue;
1568 // Do backward checking.
1569 // For all cases above, we need a backward checking to filter out edges that
1570 // are not 'strongly' biased.
1571 // BB Pred
1572 // \ /
1573 // Succ
1574 // We select edge BB->Succ if
1575 // freq(BB->Succ) > freq(Succ) * HotProb
1576 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1577 // HotProb
1578 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1579 // Case 1 is covered too, because the first equation reduces to:
1580 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1581 BlockFrequency PredEdgeFreq =
1582 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1583 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1584 BadCFGConflict = true;
1585 break;
1586 }
1587 }
1588
1589 if (BadCFGConflict) {
1590 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1591 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1592 return true;
1593 }
1594
1595 return false;
1596}
1597
1598/// Select the best successor for a block.
1599///
1600/// This looks across all successors of a particular block and attempts to
1601/// select the "best" one to be the layout successor. It only considers direct
1602/// successors which also pass the block filter. It will attempt to avoid
1603/// breaking CFG structure, but cave and break such structures in the case of
1604/// very hot successor edges.
1605///
1606/// \returns The best successor block found, or null if none are viable, along
1607/// with a boolean indicating if tail duplication is necessary.
1608MachineBlockPlacement::BlockAndTailDupResult
1609MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1610 const BlockChain &Chain,
1611 const BlockFilterSet *BlockFilter) {
1612 const BranchProbability HotProb(StaticLikelyProb, 100);
1613
1614 BlockAndTailDupResult BestSucc = {nullptr, false};
1615 auto BestProb = BranchProbability::getZero();
1616
1618 auto AdjustedSumProb =
1619 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1620
1621 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1622 << "\n");
1623
1624 // if we already precomputed the best successor for BB, return that if still
1625 // applicable.
1626 auto FoundEdge = ComputedEdges.find(BB);
1627 if (FoundEdge != ComputedEdges.end()) {
1628 MachineBasicBlock *Succ = FoundEdge->second.BB;
1629 ComputedEdges.erase(FoundEdge);
1630 BlockChain *SuccChain = BlockToChain[Succ];
1631 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1632 SuccChain != &Chain && Succ == *SuccChain->begin())
1633 return FoundEdge->second;
1634 }
1635
1636 // if BB is part of a trellis, Use the trellis to determine the optimal
1637 // fallthrough edges
1638 if (isTrellis(BB, Successors, Chain, BlockFilter))
1639 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1640 BlockFilter);
1641
1642 // For blocks with CFG violations, we may be able to lay them out anyway with
1643 // tail-duplication. We keep this vector so we can perform the probability
1644 // calculations the minimum number of times.
1646 DupCandidates;
1647 for (MachineBasicBlock *Succ : Successors) {
1648 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1649 BranchProbability SuccProb =
1650 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1651
1652 BlockChain &SuccChain = *BlockToChain[Succ];
1653 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1654 // predecessor that yields lower global cost.
1655 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1656 Chain, BlockFilter)) {
1657 // If tail duplication would make Succ profitable, place it.
1658 if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1659 DupCandidates.emplace_back(SuccProb, Succ);
1660 continue;
1661 }
1662
1663 LLVM_DEBUG(
1664 dbgs() << " Candidate: " << getBlockName(Succ)
1665 << ", probability: " << SuccProb
1666 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1667 << "\n");
1668
1669 if (BestSucc.BB && BestProb >= SuccProb) {
1670 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1671 continue;
1672 }
1673
1674 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1675 BestSucc.BB = Succ;
1676 BestProb = SuccProb;
1677 }
1678 // Handle the tail duplication candidates in order of decreasing probability.
1679 // Stop at the first one that is profitable. Also stop if they are less
1680 // profitable than BestSucc. Position is important because we preserve it and
1681 // prefer first best match. Here we aren't comparing in order, so we capture
1682 // the position instead.
1683 llvm::stable_sort(DupCandidates,
1684 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1685 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1686 return std::get<0>(L) > std::get<0>(R);
1687 });
1688 for (auto &Tup : DupCandidates) {
1689 BranchProbability DupProb;
1690 MachineBasicBlock *Succ;
1691 std::tie(DupProb, Succ) = Tup;
1692 if (DupProb < BestProb)
1693 break;
1694 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1695 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1696 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1697 << ", probability: " << DupProb
1698 << " (Tail Duplicate)\n");
1699 BestSucc.BB = Succ;
1700 BestSucc.ShouldTailDup = true;
1701 break;
1702 }
1703 }
1704
1705 if (BestSucc.BB)
1706 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1707
1708 return BestSucc;
1709}
1710
1711/// Select the best block from a worklist.
1712///
1713/// This looks through the provided worklist as a list of candidate basic
1714/// blocks and select the most profitable one to place. The definition of
1715/// profitable only really makes sense in the context of a loop. This returns
1716/// the most frequently visited block in the worklist, which in the case of
1717/// a loop, is the one most desirable to be physically close to the rest of the
1718/// loop body in order to improve i-cache behavior.
1719///
1720/// \returns The best block found, or null if none are viable.
1721MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1722 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1723 // Once we need to walk the worklist looking for a candidate, cleanup the
1724 // worklist of already placed entries.
1725 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1726 // some code complexity) into the loop below.
1727 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1728 return BlockToChain.lookup(BB) == &Chain;
1729 });
1730
1731 if (WorkList.empty())
1732 return nullptr;
1733
1734 bool IsEHPad = WorkList[0]->isEHPad();
1735
1736 MachineBasicBlock *BestBlock = nullptr;
1737 BlockFrequency BestFreq;
1738 for (MachineBasicBlock *MBB : WorkList) {
1739 assert(MBB->isEHPad() == IsEHPad &&
1740 "EHPad mismatch between block and work list.");
1741
1742 BlockChain &SuccChain = *BlockToChain[MBB];
1743 if (&SuccChain == &Chain)
1744 continue;
1745
1746 assert(SuccChain.UnscheduledPredecessors == 0 &&
1747 "Found CFG-violating block");
1748
1749 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1750 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1751 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1752 << " (freq)\n");
1753
1754 // For ehpad, we layout the least probable first as to avoid jumping back
1755 // from least probable landingpads to more probable ones.
1756 //
1757 // FIXME: Using probability is probably (!) not the best way to achieve
1758 // this. We should probably have a more principled approach to layout
1759 // cleanup code.
1760 //
1761 // The goal is to get:
1762 //
1763 // +--------------------------+
1764 // | V
1765 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1766 //
1767 // Rather than:
1768 //
1769 // +-------------------------------------+
1770 // V |
1771 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1772 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1773 continue;
1774
1775 BestBlock = MBB;
1776 BestFreq = CandidateFreq;
1777 }
1778
1779 return BestBlock;
1780}
1781
1782/// Retrieve the first unplaced basic block in the entire function.
1783///
1784/// This routine is called when we are unable to use the CFG to walk through
1785/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1786/// We walk through the function's blocks in order, starting from the
1787/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1788/// re-scanning the entire sequence on repeated calls to this routine.
1789MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1790 const BlockChain &PlacedChain,
1791 MachineFunction::iterator &PrevUnplacedBlockIt) {
1792
1793 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1794 ++I) {
1795 if (BlockToChain[&*I] != &PlacedChain) {
1796 PrevUnplacedBlockIt = I;
1797 // Now select the head of the chain to which the unplaced block belongs
1798 // as the block to place. This will force the entire chain to be placed,
1799 // and satisfies the requirements of merging chains.
1800 return *BlockToChain[&*I]->begin();
1801 }
1802 }
1803 return nullptr;
1804}
1805
1806/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1807///
1808/// This is similar to getFirstUnplacedBlock for the entire function, but since
1809/// the size of BlockFilter is typically far less than the number of blocks in
1810/// the entire function, iterating through the BlockFilter is more efficient.
1811/// When processing the entire funciton, using the version without BlockFilter
1812/// has a complexity of #(loops in function) * #(blocks in function), while this
1813/// version has a complexity of sum(#(loops in block) foreach block in function)
1814/// which is always smaller. For long function mostly sequential in structure,
1815/// the complexity is amortized to 1 * #(blocks in function).
1816MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1817 const BlockChain &PlacedChain,
1818 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1819 const BlockFilterSet *BlockFilter) {
1820 assert(BlockFilter);
1821 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1822 ++PrevUnplacedBlockInFilterIt) {
1823 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1824 if (C != &PlacedChain) {
1825 return *C->begin();
1826 }
1827 }
1828 return nullptr;
1829}
1830
1831void MachineBlockPlacement::fillWorkLists(
1833 const BlockFilterSet *BlockFilter = nullptr) {
1834 BlockChain &Chain = *BlockToChain[MBB];
1835 if (!UpdatedPreds.insert(&Chain).second)
1836 return;
1837
1838 assert(
1839 Chain.UnscheduledPredecessors == 0 &&
1840 "Attempting to place block with unscheduled predecessors in worklist.");
1841 for (MachineBasicBlock *ChainBB : Chain) {
1842 assert(BlockToChain[ChainBB] == &Chain &&
1843 "Block in chain doesn't match BlockToChain map.");
1844 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1845 if (BlockFilter && !BlockFilter->count(Pred))
1846 continue;
1847 if (BlockToChain[Pred] == &Chain)
1848 continue;
1849 ++Chain.UnscheduledPredecessors;
1850 }
1851 }
1852
1853 if (Chain.UnscheduledPredecessors != 0)
1854 return;
1855
1856 MachineBasicBlock *BB = *Chain.begin();
1857 if (BB->isEHPad())
1858 EHPadWorkList.push_back(BB);
1859 else
1860 BlockWorkList.push_back(BB);
1861}
1862
1863void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1864 BlockChain &Chain,
1865 BlockFilterSet *BlockFilter) {
1866 assert(HeadBB && "BB must not be null.\n");
1867 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1868 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1869 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1870 if (BlockFilter)
1871 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1872
1873 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1874 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1875 MachineBasicBlock *BB = *std::prev(Chain.end());
1876 while (true) {
1877 assert(BB && "null block found at end of chain in loop.");
1878 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1879 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1880
1881 // Look for the best viable successor if there is one to place immediately
1882 // after this block.
1883 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1884 MachineBasicBlock *BestSucc = Result.BB;
1885 bool ShouldTailDup = Result.ShouldTailDup;
1886 if (allowTailDupPlacement())
1887 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(
1888 BB, BestSucc, Chain, BlockFilter));
1889
1890 // If an immediate successor isn't available, look for the best viable
1891 // block among those we've identified as not violating the loop's CFG at
1892 // this point. This won't be a fallthrough, but it will increase locality.
1893 if (!BestSucc)
1894 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1895 if (!BestSucc)
1896 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1897
1898 if (!BestSucc) {
1899 if (BlockFilter)
1900 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1901 BlockFilter);
1902 else
1903 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1904 if (!BestSucc)
1905 break;
1906
1907 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1908 "layout successor until the CFG reduces\n");
1909 }
1910
1911 // Placement may have changed tail duplication opportunities.
1912 // Check for that now.
1913 if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1914 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1915 BlockFilter, PrevUnplacedBlockIt,
1916 PrevUnplacedBlockInFilterIt);
1917 // If the chosen successor was duplicated into BB, don't bother laying
1918 // it out, just go round the loop again with BB as the chain end.
1919 if (!BB->isSuccessor(BestSucc))
1920 continue;
1921 }
1922
1923 // Place this block, updating the datastructures to reflect its placement.
1924 BlockChain &SuccChain = *BlockToChain[BestSucc];
1925 // Zero out UnscheduledPredecessors for the successor we're about to merge
1926 // in case we selected a successor that didn't fit naturally into the CFG.
1927 SuccChain.UnscheduledPredecessors = 0;
1928 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1929 << getBlockName(BestSucc) << "\n");
1930 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1931 Chain.merge(BestSucc, &SuccChain);
1932 BB = *std::prev(Chain.end());
1933 }
1934
1935 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1936 << getBlockName(*Chain.begin()) << "\n");
1937}
1938
1939// If bottom of block BB has only one successor OldTop, in most cases it is
1940// profitable to move it before OldTop, except the following case:
1941//
1942// -->OldTop<-
1943// | . |
1944// | . |
1945// | . |
1946// ---Pred |
1947// | |
1948// BB-----
1949//
1950// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1951// layout the other successor below it, so it can't reduce taken branch.
1952// In this case we keep its original layout.
1953bool MachineBlockPlacement::canMoveBottomBlockToTop(
1954 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) {
1955 if (BottomBlock->pred_size() != 1)
1956 return true;
1957 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1958 if (Pred->succ_size() != 2)
1959 return true;
1960
1961 MachineBasicBlock *OtherBB = *Pred->succ_begin();
1962 if (OtherBB == BottomBlock)
1963 OtherBB = *Pred->succ_rbegin();
1964 if (OtherBB == OldTop)
1965 return false;
1966
1967 return true;
1968}
1969
1970// Find out the possible fall through frequence to the top of a loop.
1972MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
1973 const BlockFilterSet &LoopBlockSet) {
1974 BlockFrequency MaxFreq = BlockFrequency(0);
1975 for (MachineBasicBlock *Pred : Top->predecessors()) {
1976 BlockChain *PredChain = BlockToChain[Pred];
1977 if (!LoopBlockSet.count(Pred) &&
1978 (!PredChain || Pred == *std::prev(PredChain->end()))) {
1979 // Found a Pred block can be placed before Top.
1980 // Check if Top is the best successor of Pred.
1981 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1982 bool TopOK = true;
1983 for (MachineBasicBlock *Succ : Pred->successors()) {
1984 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1985 BlockChain *SuccChain = BlockToChain[Succ];
1986 // Check if Succ can be placed after Pred.
1987 // Succ should not be in any chain, or it is the head of some chain.
1988 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1989 (!SuccChain || Succ == *SuccChain->begin())) {
1990 TopOK = false;
1991 break;
1992 }
1993 }
1994 if (TopOK) {
1995 BlockFrequency EdgeFreq =
1996 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top);
1997 if (EdgeFreq > MaxFreq)
1998 MaxFreq = EdgeFreq;
1999 }
2000 }
2001 }
2002 return MaxFreq;
2003}
2004
2005// Compute the fall through gains when move NewTop before OldTop.
2006//
2007// In following diagram, edges marked as "-" are reduced fallthrough, edges
2008// marked as "+" are increased fallthrough, this function computes
2009//
2010// SUM(increased fallthrough) - SUM(decreased fallthrough)
2011//
2012// |
2013// | -
2014// V
2015// --->OldTop
2016// | .
2017// | .
2018// +| . +
2019// | Pred --->
2020// | |-
2021// | V
2022// --- NewTop <---
2023// |-
2024// V
2025//
2026BlockFrequency MachineBlockPlacement::FallThroughGains(
2027 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop,
2028 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) {
2029 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2030 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2031 if (ExitBB)
2032 FallThrough2Exit =
2033 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB);
2034 BlockFrequency BackEdgeFreq =
2035 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop);
2036
2037 // Find the best Pred of NewTop.
2038 MachineBasicBlock *BestPred = nullptr;
2039 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2040 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2041 if (!LoopBlockSet.count(Pred))
2042 continue;
2043 BlockChain *PredChain = BlockToChain[Pred];
2044 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2045 BlockFrequency EdgeFreq =
2046 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2047 if (EdgeFreq > FallThroughFromPred) {
2048 FallThroughFromPred = EdgeFreq;
2049 BestPred = Pred;
2050 }
2051 }
2052 }
2053
2054 // If NewTop is not placed after Pred, another successor can be placed
2055 // after Pred.
2056 BlockFrequency NewFreq = BlockFrequency(0);
2057 if (BestPred) {
2058 for (MachineBasicBlock *Succ : BestPred->successors()) {
2059 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2060 continue;
2061 if (ComputedEdges.contains(Succ))
2062 continue;
2063 BlockChain *SuccChain = BlockToChain[Succ];
2064 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2065 (SuccChain == BlockToChain[BestPred]))
2066 continue;
2067 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2068 MBPI->getEdgeProbability(BestPred, Succ);
2069 if (EdgeFreq > NewFreq)
2070 NewFreq = EdgeFreq;
2071 }
2072 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2073 MBPI->getEdgeProbability(BestPred, NewTop);
2074 if (NewFreq > OrigEdgeFreq) {
2075 // If NewTop is not the best successor of Pred, then Pred doesn't
2076 // fallthrough to NewTop. So there is no FallThroughFromPred and
2077 // NewFreq.
2078 NewFreq = BlockFrequency(0);
2079 FallThroughFromPred = BlockFrequency(0);
2080 }
2081 }
2082
2084 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2085 BlockFrequency Lost =
2086 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2087 if (Gains > Lost)
2088 Result = Gains - Lost;
2089 return Result;
2090}
2091
2092/// Helper function of findBestLoopTop. Find the best loop top block
2093/// from predecessors of old top.
2094///
2095/// Look for a block which is strictly better than the old top for laying
2096/// out before the old top of the loop. This looks for only two patterns:
2097///
2098/// 1. a block has only one successor, the old loop top
2099///
2100/// Because such a block will always result in an unconditional jump,
2101/// rotating it in front of the old top is always profitable.
2102///
2103/// 2. a block has two successors, one is old top, another is exit
2104/// and it has more than one predecessors
2105///
2106/// If it is below one of its predecessors P, only P can fall through to
2107/// it, all other predecessors need a jump to it, and another conditional
2108/// jump to loop header. If it is moved before loop header, all its
2109/// predecessors jump to it, then fall through to loop header. So all its
2110/// predecessors except P can reduce one taken branch.
2111/// At the same time, move it before old top increases the taken branch
2112/// to loop exit block, so the reduced taken branch will be compared with
2113/// the increased taken branch to the loop exit block.
2114MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2115 MachineBasicBlock *OldTop, const MachineLoop &L,
2116 const BlockFilterSet &LoopBlockSet) {
2117 // Check that the header hasn't been fused with a preheader block due to
2118 // crazy branches. If it has, we need to start with the header at the top to
2119 // prevent pulling the preheader into the loop body.
2120 BlockChain &HeaderChain = *BlockToChain[OldTop];
2121 if (!LoopBlockSet.count(*HeaderChain.begin()))
2122 return OldTop;
2123 if (OldTop != *HeaderChain.begin())
2124 return OldTop;
2125
2126 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2127 << "\n");
2128
2129 BlockFrequency BestGains = BlockFrequency(0);
2130 MachineBasicBlock *BestPred = nullptr;
2131 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2132 if (!LoopBlockSet.count(Pred))
2133 continue;
2134 if (Pred == L.getHeader())
2135 continue;
2136 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2137 << Pred->succ_size() << " successors, "
2138 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2139 if (Pred->succ_size() > 2)
2140 continue;
2141
2142 MachineBasicBlock *OtherBB = nullptr;
2143 if (Pred->succ_size() == 2) {
2144 OtherBB = *Pred->succ_begin();
2145 if (OtherBB == OldTop)
2146 OtherBB = *Pred->succ_rbegin();
2147 }
2148
2149 if (!canMoveBottomBlockToTop(Pred, OldTop))
2150 continue;
2151
2152 BlockFrequency Gains =
2153 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet);
2154 if ((Gains > BlockFrequency(0)) &&
2155 (Gains > BestGains ||
2156 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2157 BestPred = Pred;
2158 BestGains = Gains;
2159 }
2160 }
2161
2162 // If no direct predecessor is fine, just use the loop header.
2163 if (!BestPred) {
2164 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2165 return OldTop;
2166 }
2167
2168 // Walk backwards through any straight line of predecessors.
2169 while (BestPred->pred_size() == 1 &&
2170 (*BestPred->pred_begin())->succ_size() == 1 &&
2171 *BestPred->pred_begin() != L.getHeader())
2172 BestPred = *BestPred->pred_begin();
2173
2174 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2175 return BestPred;
2176}
2177
2178/// Find the best loop top block for layout.
2179///
2180/// This function iteratively calls findBestLoopTopHelper, until no new better
2181/// BB can be found.
2183MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2184 const BlockFilterSet &LoopBlockSet) {
2185 // Placing the latch block before the header may introduce an extra branch
2186 // that skips this block the first time the loop is executed, which we want
2187 // to avoid when optimising for size.
2188 // FIXME: in theory there is a case that does not introduce a new branch,
2189 // i.e. when the layout predecessor does not fallthrough to the loop header.
2190 // In practice this never happens though: there always seems to be a preheader
2191 // that can fallthrough and that is also placed before the header.
2192 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()))
2193 return L.getHeader();
2194
2195 MachineBasicBlock *OldTop = nullptr;
2196 MachineBasicBlock *NewTop = L.getHeader();
2197 while (NewTop != OldTop) {
2198 OldTop = NewTop;
2199 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2200 if (NewTop != OldTop)
2201 ComputedEdges[NewTop] = {OldTop, false};
2202 }
2203 return NewTop;
2204}
2205
2206/// Find the best loop exiting block for layout.
2207///
2208/// This routine implements the logic to analyze the loop looking for the best
2209/// block to layout at the top of the loop. Typically this is done to maximize
2210/// fallthrough opportunities.
2212MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2213 const BlockFilterSet &LoopBlockSet,
2214 BlockFrequency &ExitFreq) {
2215 // We don't want to layout the loop linearly in all cases. If the loop header
2216 // is just a normal basic block in the loop, we want to look for what block
2217 // within the loop is the best one to layout at the top. However, if the loop
2218 // header has be pre-merged into a chain due to predecessors not having
2219 // analyzable branches, *and* the predecessor it is merged with is *not* part
2220 // of the loop, rotating the header into the middle of the loop will create
2221 // a non-contiguous range of blocks which is Very Bad. So start with the
2222 // header and only rotate if safe.
2223 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2224 if (!LoopBlockSet.count(*HeaderChain.begin()))
2225 return nullptr;
2226
2227 BlockFrequency BestExitEdgeFreq;
2228 unsigned BestExitLoopDepth = 0;
2229 MachineBasicBlock *ExitingBB = nullptr;
2230 // If there are exits to outer loops, loop rotation can severely limit
2231 // fallthrough opportunities unless it selects such an exit. Keep a set of
2232 // blocks where rotating to exit with that block will reach an outer loop.
2233 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2234
2235 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2236 << getBlockName(L.getHeader()) << "\n");
2237 for (MachineBasicBlock *MBB : L.getBlocks()) {
2238 BlockChain &Chain = *BlockToChain[MBB];
2239 // Ensure that this block is at the end of a chain; otherwise it could be
2240 // mid-way through an inner loop or a successor of an unanalyzable branch.
2241 if (MBB != *std::prev(Chain.end()))
2242 continue;
2243
2244 // Now walk the successors. We need to establish whether this has a viable
2245 // exiting successor and whether it has a viable non-exiting successor.
2246 // We store the old exiting state and restore it if a viable looping
2247 // successor isn't found.
2248 MachineBasicBlock *OldExitingBB = ExitingBB;
2249 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2250 bool HasLoopingSucc = false;
2251 for (MachineBasicBlock *Succ : MBB->successors()) {
2252 if (Succ->isEHPad())
2253 continue;
2254 if (Succ == MBB)
2255 continue;
2256 BlockChain &SuccChain = *BlockToChain[Succ];
2257 // Don't split chains, either this chain or the successor's chain.
2258 if (&Chain == &SuccChain) {
2259 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2260 << getBlockName(Succ) << " (chain conflict)\n");
2261 continue;
2262 }
2263
2264 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2265 if (LoopBlockSet.count(Succ)) {
2266 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2267 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2268 HasLoopingSucc = true;
2269 continue;
2270 }
2271
2272 unsigned SuccLoopDepth = 0;
2273 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2274 SuccLoopDepth = ExitLoop->getLoopDepth();
2275 if (ExitLoop->contains(&L))
2276 BlocksExitingToOuterLoop.insert(MBB);
2277 }
2278
2279 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2280 LLVM_DEBUG(
2281 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2282 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2283 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2284 // Note that we bias this toward an existing layout successor to retain
2285 // incoming order in the absence of better information. The exit must have
2286 // a frequency higher than the current exit before we consider breaking
2287 // the layout.
2288 BranchProbability Bias(100 - ExitBlockBias, 100);
2289 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2290 ExitEdgeFreq > BestExitEdgeFreq ||
2291 (MBB->isLayoutSuccessor(Succ) &&
2292 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2293 BestExitEdgeFreq = ExitEdgeFreq;
2294 ExitingBB = MBB;
2295 }
2296 }
2297
2298 if (!HasLoopingSucc) {
2299 // Restore the old exiting state, no viable looping successor was found.
2300 ExitingBB = OldExitingBB;
2301 BestExitEdgeFreq = OldBestExitEdgeFreq;
2302 }
2303 }
2304 // Without a candidate exiting block or with only a single block in the
2305 // loop, just use the loop header to layout the loop.
2306 if (!ExitingBB) {
2307 LLVM_DEBUG(
2308 dbgs() << " No other candidate exit blocks, using loop header\n");
2309 return nullptr;
2310 }
2311 if (L.getNumBlocks() == 1) {
2312 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2313 return nullptr;
2314 }
2315
2316 // Also, if we have exit blocks which lead to outer loops but didn't select
2317 // one of them as the exiting block we are rotating toward, disable loop
2318 // rotation altogether.
2319 if (!BlocksExitingToOuterLoop.empty() &&
2320 !BlocksExitingToOuterLoop.count(ExitingBB))
2321 return nullptr;
2322
2323 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2324 << "\n");
2325 ExitFreq = BestExitEdgeFreq;
2326 return ExitingBB;
2327}
2328
2329/// Check if there is a fallthrough to loop header Top.
2330///
2331/// 1. Look for a Pred that can be layout before Top.
2332/// 2. Check if Top is the most possible successor of Pred.
2333bool MachineBlockPlacement::hasViableTopFallthrough(
2334 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) {
2335 for (MachineBasicBlock *Pred : Top->predecessors()) {
2336 BlockChain *PredChain = BlockToChain[Pred];
2337 if (!LoopBlockSet.count(Pred) &&
2338 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2339 // Found a Pred block can be placed before Top.
2340 // Check if Top is the best successor of Pred.
2341 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2342 bool TopOK = true;
2343 for (MachineBasicBlock *Succ : Pred->successors()) {
2344 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2345 BlockChain *SuccChain = BlockToChain[Succ];
2346 // Check if Succ can be placed after Pred.
2347 // Succ should not be in any chain, or it is the head of some chain.
2348 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2349 TopOK = false;
2350 break;
2351 }
2352 }
2353 if (TopOK)
2354 return true;
2355 }
2356 }
2357 return false;
2358}
2359
2360/// Attempt to rotate an exiting block to the bottom of the loop.
2361///
2362/// Once we have built a chain, try to rotate it to line up the hot exit block
2363/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2364/// branches. For example, if the loop has fallthrough into its header and out
2365/// of its bottom already, don't rotate it.
2366void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2367 const MachineBasicBlock *ExitingBB,
2368 BlockFrequency ExitFreq,
2369 const BlockFilterSet &LoopBlockSet) {
2370 if (!ExitingBB)
2371 return;
2372
2373 MachineBasicBlock *Top = *LoopChain.begin();
2374 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2375
2376 // If ExitingBB is already the last one in a chain then nothing to do.
2377 if (Bottom == ExitingBB)
2378 return;
2379
2380 // The entry block should always be the first BB in a function.
2381 if (Top->isEntryBlock())
2382 return;
2383
2384 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2385
2386 // If the header has viable fallthrough, check whether the current loop
2387 // bottom is a viable exiting block. If so, bail out as rotating will
2388 // introduce an unnecessary branch.
2389 if (ViableTopFallthrough) {
2390 for (MachineBasicBlock *Succ : Bottom->successors()) {
2391 BlockChain *SuccChain = BlockToChain[Succ];
2392 if (!LoopBlockSet.count(Succ) &&
2393 (!SuccChain || Succ == *SuccChain->begin()))
2394 return;
2395 }
2396
2397 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2398 // frequency is larger than top fallthrough.
2399 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2400 if (FallThrough2Top >= ExitFreq)
2401 return;
2402 }
2403
2404 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2405 if (ExitIt == LoopChain.end())
2406 return;
2407
2408 // Rotating a loop exit to the bottom when there is a fallthrough to top
2409 // trades the entry fallthrough for an exit fallthrough.
2410 // If there is no bottom->top edge, but the chosen exit block does have
2411 // a fallthrough, we break that fallthrough for nothing in return.
2412
2413 // Let's consider an example. We have a built chain of basic blocks
2414 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2415 // By doing a rotation we get
2416 // Bk+1, ..., Bn, B1, ..., Bk
2417 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2418 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2419 // It might be compensated by fallthrough Bn -> B1.
2420 // So we have a condition to avoid creation of extra branch by loop rotation.
2421 // All below must be true to avoid loop rotation:
2422 // If there is a fallthrough to top (B1)
2423 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2424 // There is no fallthrough from bottom (Bn) to top (B1).
2425 // Please note that there is no exit fallthrough from Bn because we checked it
2426 // above.
2427 if (ViableTopFallthrough) {
2428 assert(std::next(ExitIt) != LoopChain.end() &&
2429 "Exit should not be last BB");
2430 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2431 if (ExitingBB->isSuccessor(NextBlockInChain))
2432 if (!Bottom->isSuccessor(Top))
2433 return;
2434 }
2435
2436 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2437 << " at bottom\n");
2438 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2439}
2440
2441/// Attempt to rotate a loop based on profile data to reduce branch cost.
2442///
2443/// With profile data, we can determine the cost in terms of missed fall through
2444/// opportunities when rotating a loop chain and select the best rotation.
2445/// Basically, there are three kinds of cost to consider for each rotation:
2446/// 1. The possibly missed fall through edge (if it exists) from BB out of
2447/// the loop to the loop header.
2448/// 2. The possibly missed fall through edges (if they exist) from the loop
2449/// exits to BB out of the loop.
2450/// 3. The missed fall through edge (if it exists) from the last BB to the
2451/// first BB in the loop chain.
2452/// Therefore, the cost for a given rotation is the sum of costs listed above.
2453/// We select the best rotation with the smallest cost.
2454void MachineBlockPlacement::rotateLoopWithProfile(
2455 BlockChain &LoopChain, const MachineLoop &L,
2456 const BlockFilterSet &LoopBlockSet) {
2457 auto RotationPos = LoopChain.end();
2458 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2459
2460 // The entry block should always be the first BB in a function.
2461 if (ChainHeaderBB->isEntryBlock())
2462 return;
2463
2464 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2465
2466 // A utility lambda that scales up a block frequency by dividing it by a
2467 // branch probability which is the reciprocal of the scale.
2468 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2469 unsigned Scale) -> BlockFrequency {
2470 if (Scale == 0)
2471 return BlockFrequency(0);
2472 // Use operator / between BlockFrequency and BranchProbability to implement
2473 // saturating multiplication.
2474 return Freq / BranchProbability(1, Scale);
2475 };
2476
2477 // Compute the cost of the missed fall-through edge to the loop header if the
2478 // chain head is not the loop header. As we only consider natural loops with
2479 // single header, this computation can be done only once.
2480 BlockFrequency HeaderFallThroughCost(0);
2481 for (auto *Pred : ChainHeaderBB->predecessors()) {
2482 BlockChain *PredChain = BlockToChain[Pred];
2483 if (!LoopBlockSet.count(Pred) &&
2484 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2485 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2486 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2487 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2488 // If the predecessor has only an unconditional jump to the header, we
2489 // need to consider the cost of this jump.
2490 if (Pred->succ_size() == 1)
2491 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2492 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2493 }
2494 }
2495
2496 // Here we collect all exit blocks in the loop, and for each exit we find out
2497 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2498 // as the sum of frequencies of exit edges we collect here, excluding the exit
2499 // edge from the tail of the loop chain.
2501 for (auto *BB : LoopChain) {
2502 auto LargestExitEdgeProb = BranchProbability::getZero();
2503 for (auto *Succ : BB->successors()) {
2504 BlockChain *SuccChain = BlockToChain[Succ];
2505 if (!LoopBlockSet.count(Succ) &&
2506 (!SuccChain || Succ == *SuccChain->begin())) {
2507 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2508 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2509 }
2510 }
2511 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2512 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2513 ExitsWithFreq.emplace_back(BB, ExitFreq);
2514 }
2515 }
2516
2517 // In this loop we iterate every block in the loop chain and calculate the
2518 // cost assuming the block is the head of the loop chain. When the loop ends,
2519 // we should have found the best candidate as the loop chain's head.
2520 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2521 EndIter = LoopChain.end();
2522 Iter != EndIter; Iter++, TailIter++) {
2523 // TailIter is used to track the tail of the loop chain if the block we are
2524 // checking (pointed by Iter) is the head of the chain.
2525 if (TailIter == LoopChain.end())
2526 TailIter = LoopChain.begin();
2527
2528 auto TailBB = *TailIter;
2529
2530 // Calculate the cost by putting this BB to the top.
2532
2533 // If the current BB is the loop header, we need to take into account the
2534 // cost of the missed fall through edge from outside of the loop to the
2535 // header.
2536 if (Iter != LoopChain.begin())
2537 Cost += HeaderFallThroughCost;
2538
2539 // Collect the loop exit cost by summing up frequencies of all exit edges
2540 // except the one from the chain tail.
2541 for (auto &ExitWithFreq : ExitsWithFreq)
2542 if (TailBB != ExitWithFreq.first)
2543 Cost += ExitWithFreq.second;
2544
2545 // The cost of breaking the once fall-through edge from the tail to the top
2546 // of the loop chain. Here we need to consider three cases:
2547 // 1. If the tail node has only one successor, then we will get an
2548 // additional jmp instruction. So the cost here is (MisfetchCost +
2549 // JumpInstCost) * tail node frequency.
2550 // 2. If the tail node has two successors, then we may still get an
2551 // additional jmp instruction if the layout successor after the loop
2552 // chain is not its CFG successor. Note that the more frequently executed
2553 // jmp instruction will be put ahead of the other one. Assume the
2554 // frequency of those two branches are x and y, where x is the frequency
2555 // of the edge to the chain head, then the cost will be
2556 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2557 // 3. If the tail node has more than two successors (this rarely happens),
2558 // we won't consider any additional cost.
2559 if (TailBB->isSuccessor(*Iter)) {
2560 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2561 if (TailBB->succ_size() == 1)
2562 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2563 else if (TailBB->succ_size() == 2) {
2564 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2565 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2566 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2567 ? TailBBFreq * TailToHeadProb.getCompl()
2568 : TailToHeadFreq;
2569 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2570 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2571 }
2572 }
2573
2574 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2575 << getBlockName(*Iter) << " to the top: "
2576 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2577
2578 if (Cost < SmallestRotationCost) {
2579 SmallestRotationCost = Cost;
2580 RotationPos = Iter;
2581 }
2582 }
2583
2584 if (RotationPos != LoopChain.end()) {
2585 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2586 << " to the top\n");
2587 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2588 }
2589}
2590
2591/// Collect blocks in the given loop that are to be placed.
2592///
2593/// When profile data is available, exclude cold blocks from the returned set;
2594/// otherwise, collect all blocks in the loop.
2596MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2597 // Collect the blocks in a set ordered by block number, as this gives the same
2598 // order as they appear in the function.
2599 struct MBBCompare {
2600 bool operator()(const MachineBasicBlock *X,
2601 const MachineBasicBlock *Y) const {
2602 return X->getNumber() < Y->getNumber();
2603 }
2604 };
2605 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2606
2607 // Filter cold blocks off from LoopBlockSet when profile data is available.
2608 // Collect the sum of frequencies of incoming edges to the loop header from
2609 // outside. If we treat the loop as a super block, this is the frequency of
2610 // the loop. Then for each block in the loop, we calculate the ratio between
2611 // its frequency and the frequency of the loop block. When it is too small,
2612 // don't add it to the loop chain. If there are outer loops, then this block
2613 // will be merged into the first outer loop chain for which this block is not
2614 // cold anymore. This needs precise profile data and we only do this when
2615 // profile data is available.
2616 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2617 BlockFrequency LoopFreq(0);
2618 for (auto *LoopPred : L.getHeader()->predecessors())
2619 if (!L.contains(LoopPred))
2620 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2621 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2622
2623 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2624 if (LoopBlockSet.count(LoopBB))
2625 continue;
2626 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2627 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2628 continue;
2629 BlockChain *Chain = BlockToChain[LoopBB];
2630 for (MachineBasicBlock *ChainBB : *Chain)
2631 LoopBlockSet.insert(ChainBB);
2632 }
2633 } else
2634 LoopBlockSet.insert(L.block_begin(), L.block_end());
2635
2636 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2637 // std::set. We will only remove blocks and never insert them, which will
2638 // preserve the ordering.
2639 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2640 return Ret;
2641}
2642
2643/// Forms basic block chains from the natural loop structures.
2644///
2645/// These chains are designed to preserve the existing *structure* of the code
2646/// as much as possible. We can then stitch the chains together in a way which
2647/// both preserves the topological structure and minimizes taken conditional
2648/// branches.
2649void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2650 // First recurse through any nested loops, building chains for those inner
2651 // loops.
2652 for (const MachineLoop *InnerLoop : L)
2653 buildLoopChains(*InnerLoop);
2654
2655 assert(BlockWorkList.empty() &&
2656 "BlockWorkList not empty when starting to build loop chains.");
2657 assert(EHPadWorkList.empty() &&
2658 "EHPadWorkList not empty when starting to build loop chains.");
2659 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2660
2661 // Check if we have profile data for this function. If yes, we will rotate
2662 // this loop by modeling costs more precisely which requires the profile data
2663 // for better layout.
2664 bool RotateLoopWithProfile =
2666 (PreciseRotationCost && F->getFunction().hasProfileData());
2667
2668 // First check to see if there is an obviously preferable top block for the
2669 // loop. This will default to the header, but may end up as one of the
2670 // predecessors to the header if there is one which will result in strictly
2671 // fewer branches in the loop body.
2672 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2673
2674 // If we selected just the header for the loop top, look for a potentially
2675 // profitable exit block in the event that rotating the loop can eliminate
2676 // branches by placing an exit edge at the bottom.
2677 //
2678 // Loops are processed innermost to uttermost, make sure we clear
2679 // PreferredLoopExit before processing a new loop.
2680 PreferredLoopExit = nullptr;
2681 BlockFrequency ExitFreq;
2682 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2683 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2684
2685 BlockChain &LoopChain = *BlockToChain[LoopTop];
2686
2687 // FIXME: This is a really lame way of walking the chains in the loop: we
2688 // walk the blocks, and use a set to prevent visiting a particular chain
2689 // twice.
2690 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2691 assert(LoopChain.UnscheduledPredecessors == 0 &&
2692 "LoopChain should not have unscheduled predecessors.");
2693 UpdatedPreds.insert(&LoopChain);
2694
2695 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2696 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2697
2698 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2699
2700 if (RotateLoopWithProfile)
2701 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2702 else
2703 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2704
2705 LLVM_DEBUG({
2706 // Crash at the end so we get all of the debugging output first.
2707 bool BadLoop = false;
2708 if (LoopChain.UnscheduledPredecessors) {
2709 BadLoop = true;
2710 dbgs() << "Loop chain contains a block without its preds placed!\n"
2711 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2712 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2713 }
2714 for (MachineBasicBlock *ChainBB : LoopChain) {
2715 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2716 if (!LoopBlockSet.remove(ChainBB)) {
2717 // We don't mark the loop as bad here because there are real situations
2718 // where this can occur. For example, with an unanalyzable fallthrough
2719 // from a loop block to a non-loop block or vice versa.
2720 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2721 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2722 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2723 << " Bad block: " << getBlockName(ChainBB) << "\n";
2724 }
2725 }
2726
2727 if (!LoopBlockSet.empty()) {
2728 BadLoop = true;
2729 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2730 dbgs() << "Loop contains blocks never placed into a chain!\n"
2731 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2732 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2733 << " Bad block: " << getBlockName(LoopBB) << "\n";
2734 }
2735 assert(!BadLoop && "Detected problems with the placement of this loop.");
2736 });
2737
2738 BlockWorkList.clear();
2739 EHPadWorkList.clear();
2740}
2741
2742void MachineBlockPlacement::buildCFGChains() {
2743 // Ensure that every BB in the function has an associated chain to simplify
2744 // the assumptions of the remaining algorithm.
2745 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2746 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2747 ++FI) {
2748 MachineBasicBlock *BB = &*FI;
2749 BlockChain *Chain =
2750 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2751 // Also, merge any blocks which we cannot reason about and must preserve
2752 // the exact fallthrough behavior for.
2753 while (true) {
2754 Cond.clear();
2755 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2756 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2757 break;
2758
2759 MachineFunction::iterator NextFI = std::next(FI);
2760 MachineBasicBlock *NextBB = &*NextFI;
2761 // Ensure that the layout successor is a viable block, as we know that
2762 // fallthrough is a possibility.
2763 assert(NextFI != FE && "Can't fallthrough past the last block.");
2764 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2765 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2766 << "\n");
2767 Chain->merge(NextBB, nullptr);
2768#ifndef NDEBUG
2769 BlocksWithUnanalyzableExits.insert(&*BB);
2770#endif
2771 FI = NextFI;
2772 BB = NextBB;
2773 }
2774 }
2775
2776 // Build any loop-based chains.
2777 PreferredLoopExit = nullptr;
2778 for (MachineLoop *L : *MLI)
2779 buildLoopChains(*L);
2780
2781 assert(BlockWorkList.empty() &&
2782 "BlockWorkList should be empty before building final chain.");
2783 assert(EHPadWorkList.empty() &&
2784 "EHPadWorkList should be empty before building final chain.");
2785
2786 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2787 for (MachineBasicBlock &MBB : *F)
2788 fillWorkLists(&MBB, UpdatedPreds);
2789
2790 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2791 buildChain(&F->front(), FunctionChain);
2792
2793#ifndef NDEBUG
2794 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2795#endif
2796 LLVM_DEBUG({
2797 // Crash at the end so we get all of the debugging output first.
2798 bool BadFunc = false;
2799 FunctionBlockSetType FunctionBlockSet;
2800 for (MachineBasicBlock &MBB : *F)
2801 FunctionBlockSet.insert(&MBB);
2802
2803 for (MachineBasicBlock *ChainBB : FunctionChain)
2804 if (!FunctionBlockSet.erase(ChainBB)) {
2805 BadFunc = true;
2806 dbgs() << "Function chain contains a block not in the function!\n"
2807 << " Bad block: " << getBlockName(ChainBB) << "\n";
2808 }
2809
2810 if (!FunctionBlockSet.empty()) {
2811 BadFunc = true;
2812 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2813 dbgs() << "Function contains blocks never placed into a chain!\n"
2814 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2815 }
2816 assert(!BadFunc && "Detected problems with the block placement.");
2817 });
2818
2819 // Remember original layout ordering, so we can update terminators after
2820 // reordering to point to the original layout successor.
2821 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2822 F->getNumBlockIDs());
2823 {
2824 MachineBasicBlock *LastMBB = nullptr;
2825 for (auto &MBB : *F) {
2826 if (LastMBB != nullptr)
2827 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2828 LastMBB = &MBB;
2829 }
2830 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2831 }
2832
2833 // Splice the blocks into place.
2834 MachineFunction::iterator InsertPos = F->begin();
2835 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2836 for (MachineBasicBlock *ChainBB : FunctionChain) {
2837 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2838 : " ... ")
2839 << getBlockName(ChainBB) << "\n");
2840 if (InsertPos != MachineFunction::iterator(ChainBB))
2841 F->splice(InsertPos, ChainBB);
2842 else
2843 ++InsertPos;
2844
2845 // Update the terminator of the previous block.
2846 if (ChainBB == *FunctionChain.begin())
2847 continue;
2848 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2849
2850 // FIXME: It would be awesome of updateTerminator would just return rather
2851 // than assert when the branch cannot be analyzed in order to remove this
2852 // boiler plate.
2853 Cond.clear();
2854 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2855
2856#ifndef NDEBUG
2857 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2858 // Given the exact block placement we chose, we may actually not _need_ to
2859 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2860 // do that at this point is a bug.
2861 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2862 !PrevBB->canFallThrough()) &&
2863 "Unexpected block with un-analyzable fallthrough!");
2864 Cond.clear();
2865 TBB = FBB = nullptr;
2866 }
2867#endif
2868
2869 // The "PrevBB" is not yet updated to reflect current code layout, so,
2870 // o. it may fall-through to a block without explicit "goto" instruction
2871 // before layout, and no longer fall-through it after layout; or
2872 // o. just opposite.
2873 //
2874 // analyzeBranch() may return erroneous value for FBB when these two
2875 // situations take place. For the first scenario FBB is mistakenly set NULL;
2876 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2877 // mistakenly pointing to "*BI".
2878 // Thus, if the future change needs to use FBB before the layout is set, it
2879 // has to correct FBB first by using the code similar to the following:
2880 //
2881 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2882 // PrevBB->updateTerminator();
2883 // Cond.clear();
2884 // TBB = FBB = nullptr;
2885 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2886 // // FIXME: This should never take place.
2887 // TBB = FBB = nullptr;
2888 // }
2889 // }
2890 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2891 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2892 }
2893 }
2894
2895 // Fixup the last block.
2896 Cond.clear();
2897 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2898 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2899 MachineBasicBlock *PrevBB = &F->back();
2900 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2901 }
2902
2903 BlockWorkList.clear();
2904 EHPadWorkList.clear();
2905}
2906
2907void MachineBlockPlacement::optimizeBranches() {
2908 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2910
2911 // Now that all the basic blocks in the chain have the proper layout,
2912 // make a final call to analyzeBranch with AllowModify set.
2913 // Indeed, the target may be able to optimize the branches in a way we
2914 // cannot because all branches may not be analyzable.
2915 // E.g., the target may be able to remove an unconditional branch to
2916 // a fallthrough when it occurs after predicated terminators.
2917 for (MachineBasicBlock *ChainBB : FunctionChain) {
2918 Cond.clear();
2919 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2920 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
2921 continue;
2922 if (!TBB || !FBB || Cond.empty())
2923 continue;
2924 // If we are optimizing for size we do not consider the runtime performance.
2925 // Instead, we retain the original branch condition so we have more uniform
2926 // instructions which will benefit ICF.
2927 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
2928 continue;
2929 // If ChainBB has a two-way branch, try to re-order the branches
2930 // such that we branch to the successor with higher probability first.
2931 if (MBPI->getEdgeProbability(ChainBB, TBB) >=
2932 MBPI->getEdgeProbability(ChainBB, FBB))
2933 continue;
2935 continue;
2936 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2937 << getBlockName(ChainBB) << "\n");
2938 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2939 << "\n");
2940 auto Dl = ChainBB->findBranchDebugLoc();
2941 TII->removeBranch(*ChainBB);
2942 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
2943 }
2944}
2945
2946void MachineBlockPlacement::alignBlocks() {
2947 // Walk through the backedges of the function now that we have fully laid out
2948 // the basic blocks and align the destination of each backedge. We don't rely
2949 // exclusively on the loop info here so that we can align backedges in
2950 // unnatural CFGs and backedges that were introduced purely because of the
2951 // loop rotations done during this layout pass.
2953 if (F->getFunction().hasMinSize() ||
2954 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2955 return;
2956 }
2957
2958 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2959 // Empty chain.
2960 if (FunctionChain.begin() == FunctionChain.end())
2961 return;
2962
2963 const BranchProbability ColdProb(1, 5); // 20%
2964 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2965 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2966 for (MachineBasicBlock *ChainBB : FunctionChain) {
2967 if (ChainBB == *FunctionChain.begin())
2968 continue;
2969
2970 // Don't align non-looping basic blocks. These are unlikely to execute
2971 // enough times to matter in practice. Note that we'll still handle
2972 // unnatural CFGs inside of a natural outer loop (the common case) and
2973 // rotated loops.
2974 MachineLoop *L = MLI->getLoopFor(ChainBB);
2975 if (!L)
2976 continue;
2977
2978 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
2979 unsigned MDAlign = 1;
2980 MDNode *LoopID = L->getLoopID();
2981 if (LoopID) {
2982 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
2983 MDNode *MD = dyn_cast<MDNode>(MDO);
2984 if (MD == nullptr)
2985 continue;
2986 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
2987 if (S == nullptr)
2988 continue;
2989 if (S->getString() == "llvm.loop.align") {
2990 assert(MD->getNumOperands() == 2 &&
2991 "per-loop align metadata should have two operands.");
2992 MDAlign =
2993 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
2994 assert(MDAlign >= 1 && "per-loop align value must be positive.");
2995 }
2996 }
2997 }
2998
2999 // Use max of the TLIAlign and MDAlign
3000 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
3001 if (LoopAlign == 1)
3002 continue; // Don't care about loop alignment.
3003
3004 // If the block is cold relative to the function entry don't waste space
3005 // aligning it.
3006 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3007 if (Freq < WeightedEntryFreq)
3008 continue;
3009
3010 // If the block is cold relative to its loop header, don't align it
3011 // regardless of what edges into the block exist.
3012 MachineBasicBlock *LoopHeader = L->getHeader();
3013 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3014 if (Freq < (LoopHeaderFreq * ColdProb))
3015 continue;
3016
3017 // If the global profiles indicates so, don't align it.
3018 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3019 !TLI->alignLoopsWithOptSize())
3020 continue;
3021
3022 // Check for the existence of a non-layout predecessor which would benefit
3023 // from aligning this block.
3024 MachineBasicBlock *LayoutPred =
3025 &*std::prev(MachineFunction::iterator(ChainBB));
3026
3027 auto DetermineMaxAlignmentPadding = [&]() {
3028 // Set the maximum bytes allowed to be emitted for alignment.
3029 unsigned MaxBytes;
3030 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3032 else
3033 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3034 ChainBB->setMaxBytesForAlignment(MaxBytes);
3035 };
3036
3037 // Force alignment if all the predecessors are jumps. We already checked
3038 // that the block isn't cold above.
3039 if (!LayoutPred->isSuccessor(ChainBB)) {
3040 ChainBB->setAlignment(LoopAlign);
3041 DetermineMaxAlignmentPadding();
3042 continue;
3043 }
3044
3045 // Align this block if the layout predecessor's edge into this block is
3046 // cold relative to the block. When this is true, other predecessors make up
3047 // all of the hot entries into the block and thus alignment is likely to be
3048 // important.
3049 BranchProbability LayoutProb =
3050 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3051 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3052 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3053 ChainBB->setAlignment(LoopAlign);
3054 DetermineMaxAlignmentPadding();
3055 }
3056 }
3057
3058 const bool HasMaxBytesOverride =
3059 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3060
3061 if (AlignAllBlock)
3062 // Align all of the blocks in the function to a specific alignment.
3063 for (MachineBasicBlock &MBB : *F) {
3064 if (HasMaxBytesOverride)
3067 else
3069 }
3070 else if (AlignAllNonFallThruBlocks) {
3071 // Align all of the blocks that have no fall-through predecessors to a
3072 // specific alignment.
3073 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3074 auto LayoutPred = std::prev(MBI);
3075 if (!LayoutPred->isSuccessor(&*MBI)) {
3076 if (HasMaxBytesOverride)
3077 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3079 else
3080 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3081 }
3082 }
3083 }
3084}
3085
3086/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3087/// it was duplicated into its chain predecessor and removed.
3088/// \p BB - Basic block that may be duplicated.
3089///
3090/// \p LPred - Chosen layout predecessor of \p BB.
3091/// Updated to be the chain end if LPred is removed.
3092/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3093/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3094/// Used to identify which blocks to update predecessor
3095/// counts.
3096/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3097/// chosen in the given order due to unnatural CFG
3098/// only needed if \p BB is removed and
3099/// \p PrevUnplacedBlockIt pointed to \p BB.
3100/// @return true if \p BB was removed.
3101bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3103 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3104 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3105 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3106 bool Removed, DuplicatedToLPred;
3107 bool DuplicatedToOriginalLPred;
3108 Removed = maybeTailDuplicateBlock(
3109 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3110 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3111 if (!Removed)
3112 return false;
3113 DuplicatedToOriginalLPred = DuplicatedToLPred;
3114 // Iteratively try to duplicate again. It can happen that a block that is
3115 // duplicated into is still small enough to be duplicated again.
3116 // No need to call markBlockSuccessors in this case, as the blocks being
3117 // duplicated from here on are already scheduled.
3118 while (DuplicatedToLPred && Removed) {
3119 MachineBasicBlock *DupBB, *DupPred;
3120 // The removal callback causes Chain.end() to be updated when a block is
3121 // removed. On the first pass through the loop, the chain end should be the
3122 // same as it was on function entry. On subsequent passes, because we are
3123 // duplicating the block at the end of the chain, if it is removed the
3124 // chain will have shrunk by one block.
3125 BlockChain::iterator ChainEnd = Chain.end();
3126 DupBB = *(--ChainEnd);
3127 // Now try to duplicate again.
3128 if (ChainEnd == Chain.begin())
3129 break;
3130 DupPred = *std::prev(ChainEnd);
3131 Removed = maybeTailDuplicateBlock(
3132 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3133 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3134 }
3135 // If BB was duplicated into LPred, it is now scheduled. But because it was
3136 // removed, markChainSuccessors won't be called for its chain. Instead we
3137 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3138 // at the end because repeating the tail duplication can increase the number
3139 // of unscheduled predecessors.
3140 LPred = *std::prev(Chain.end());
3141 if (DuplicatedToOriginalLPred)
3142 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3143 return true;
3144}
3145
3146/// Tail duplicate \p BB into (some) predecessors if profitable.
3147/// \p BB - Basic block that may be duplicated
3148/// \p LPred - Chosen layout predecessor of \p BB
3149/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3150/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3151/// Used to identify which blocks to update predecessor
3152/// counts.
3153/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3154/// chosen in the given order due to unnatural CFG
3155/// only needed if \p BB is removed and
3156/// \p PrevUnplacedBlockIt pointed to \p BB.
3157/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3158/// \return - True if the block was duplicated into all preds and removed.
3159bool MachineBlockPlacement::maybeTailDuplicateBlock(
3160 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3161 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3162 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3163 bool &DuplicatedToLPred) {
3164 DuplicatedToLPred = false;
3165 if (!shouldTailDuplicate(BB))
3166 return false;
3167
3168 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3169 << "\n");
3170
3171 // This has to be a callback because none of it can be done after
3172 // BB is deleted.
3173 bool Removed = false;
3174 auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3175 // Signal to outer function
3176 Removed = true;
3177
3178 // Conservative default.
3179 bool InWorkList = true;
3180 // Remove from the Chain and Chain Map
3181 if (BlockToChain.count(RemBB)) {
3182 BlockChain *Chain = BlockToChain[RemBB];
3183 InWorkList = Chain->UnscheduledPredecessors == 0;
3184 Chain->remove(RemBB);
3185 BlockToChain.erase(RemBB);
3186 }
3187
3188 // Handle the unplaced block iterator
3189 if (&(*PrevUnplacedBlockIt) == RemBB) {
3190 PrevUnplacedBlockIt++;
3191 }
3192
3193 // Handle the Work Lists
3194 if (InWorkList) {
3195 SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3196 if (RemBB->isEHPad())
3197 RemoveList = EHPadWorkList;
3198 llvm::erase(RemoveList, RemBB);
3199 }
3200
3201 // Handle the filter set
3202 if (BlockFilter) {
3203 auto It = llvm::find(*BlockFilter, RemBB);
3204 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3205 // pointing to the same element as before.
3206 if (It != BlockFilter->end()) {
3207 if (It < PrevUnplacedBlockInFilterIt) {
3208 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3209 // BlockFilter is a SmallVector so all elements after RemBB are
3210 // shifted to the front by 1 after its deletion.
3211 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3212 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3213 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3214 (void)PrevBB;
3215 } else if (It == PrevUnplacedBlockInFilterIt)
3216 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3217 // have to set it to the next element.
3218 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3219 else
3220 BlockFilter->erase(It);
3221 }
3222 }
3223
3224 // Remove the block from loop info.
3225 MLI->removeBlock(RemBB);
3226 if (RemBB == PreferredLoopExit)
3227 PreferredLoopExit = nullptr;
3228
3229 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3230 << "\n");
3231 };
3232 auto RemovalCallbackRef =
3234
3236 bool IsSimple = TailDup.isSimpleBB(BB);
3238 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3239 if (F->getFunction().hasProfileData()) {
3240 // We can do partial duplication with precise profile information.
3241 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3242 if (CandidatePreds.size() == 0)
3243 return false;
3244 if (CandidatePreds.size() < BB->pred_size())
3245 CandidatePtr = &CandidatePreds;
3246 }
3247 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3248 &RemovalCallbackRef, CandidatePtr);
3249
3250 // Update UnscheduledPredecessors to reflect tail-duplication.
3251 DuplicatedToLPred = false;
3252 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3253 // We're only looking for unscheduled predecessors that match the filter.
3254 BlockChain *PredChain = BlockToChain[Pred];
3255 if (Pred == LPred)
3256 DuplicatedToLPred = true;
3257 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
3258 PredChain == &Chain)
3259 continue;
3260 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3261 if (BlockFilter && !BlockFilter->count(NewSucc))
3262 continue;
3263 BlockChain *NewChain = BlockToChain[NewSucc];
3264 if (NewChain != &Chain && NewChain != PredChain)
3265 NewChain->UnscheduledPredecessors++;
3266 }
3267 }
3268 return Removed;
3269}
3270
3271// Count the number of actual machine instructions.
3273 uint64_t InstrCount = 0;
3274 for (MachineInstr &MI : *MBB) {
3275 if (!MI.isPHI() && !MI.isMetaInstruction())
3276 InstrCount += 1;
3277 }
3278 return InstrCount;
3279}
3280
3281// The size cost of duplication is the instruction size of the duplicated block.
3282// So we should scale the threshold accordingly. But the instruction size is not
3283// available on all targets, so we use the number of instructions instead.
3284BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3285 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3286}
3287
3288// Returns true if BB is Pred's best successor.
3289bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3290 MachineBasicBlock *Pred,
3291 BlockFilterSet *BlockFilter) {
3292 if (BB == Pred)
3293 return false;
3294 if (BlockFilter && !BlockFilter->count(Pred))
3295 return false;
3296 BlockChain *PredChain = BlockToChain[Pred];
3297 if (PredChain && (Pred != *std::prev(PredChain->end())))
3298 return false;
3299
3300 // Find the successor with largest probability excluding BB.
3302 for (MachineBasicBlock *Succ : Pred->successors())
3303 if (Succ != BB) {
3304 if (BlockFilter && !BlockFilter->count(Succ))
3305 continue;
3306 BlockChain *SuccChain = BlockToChain[Succ];
3307 if (SuccChain && (Succ != *SuccChain->begin()))
3308 continue;
3309 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3310 if (SuccProb > BestProb)
3311 BestProb = SuccProb;
3312 }
3313
3314 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3315 if (BBProb <= BestProb)
3316 return false;
3317
3318 // Compute the number of reduced taken branches if Pred falls through to BB
3319 // instead of another successor. Then compare it with threshold.
3320 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3321 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3322 return Gain > scaleThreshold(BB);
3323}
3324
3325// Find out the predecessors of BB and BB can be beneficially duplicated into
3326// them.
3327void MachineBlockPlacement::findDuplicateCandidates(
3329 BlockFilterSet *BlockFilter) {
3330 MachineBasicBlock *Fallthrough = nullptr;
3331 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3332 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3335
3336 // Sort for highest frequency.
3337 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3338 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3339 };
3340 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3341 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3342 };
3343 llvm::stable_sort(Succs, CmpSucc);
3344 llvm::stable_sort(Preds, CmpPred);
3345
3346 auto SuccIt = Succs.begin();
3347 if (SuccIt != Succs.end()) {
3348 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3349 }
3350
3351 // For each predecessors of BB, compute the benefit of duplicating BB,
3352 // if it is larger than the threshold, add it into Candidates.
3353 //
3354 // If we have following control flow.
3355 //
3356 // PB1 PB2 PB3 PB4
3357 // \ | / /\
3358 // \ | / / \
3359 // \ |/ / \
3360 // BB----/ OB
3361 // /\
3362 // / \
3363 // SB1 SB2
3364 //
3365 // And it can be partially duplicated as
3366 //
3367 // PB2+BB
3368 // | PB1 PB3 PB4
3369 // | | / /\
3370 // | | / / \
3371 // | |/ / \
3372 // | BB----/ OB
3373 // |\ /|
3374 // | X |
3375 // |/ \|
3376 // SB2 SB1
3377 //
3378 // The benefit of duplicating into a predecessor is defined as
3379 // Orig_taken_branch - Duplicated_taken_branch
3380 //
3381 // The Orig_taken_branch is computed with the assumption that predecessor
3382 // jumps to BB and the most possible successor is laid out after BB.
3383 //
3384 // The Duplicated_taken_branch is computed with the assumption that BB is
3385 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3386 // SB2 for PB2 in our case). If there is no available successor, the combined
3387 // block jumps to all BB's successor, like PB3 in this example.
3388 //
3389 // If a predecessor has multiple successors, so BB can't be duplicated into
3390 // it. But it can beneficially fall through to BB, and duplicate BB into other
3391 // predecessors.
3392 for (MachineBasicBlock *Pred : Preds) {
3393 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3394
3395 if (!TailDup.canTailDuplicate(BB, Pred)) {
3396 // BB can't be duplicated into Pred, but it is possible to be layout
3397 // below Pred.
3398 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3399 Fallthrough = Pred;
3400 if (SuccIt != Succs.end())
3401 SuccIt++;
3402 }
3403 continue;
3404 }
3405
3406 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3407 BlockFrequency DupCost;
3408 if (SuccIt == Succs.end()) {
3409 // Jump to all successors;
3410 if (Succs.size() > 0)
3411 DupCost += PredFreq;
3412 } else {
3413 // Fallthrough to *SuccIt, jump to all other successors;
3414 DupCost += PredFreq;
3415 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3416 }
3417
3418 assert(OrigCost >= DupCost);
3419 OrigCost -= DupCost;
3420 if (OrigCost > BBDupThreshold) {
3421 Candidates.push_back(Pred);
3422 if (SuccIt != Succs.end())
3423 SuccIt++;
3424 }
3425 }
3426
3427 // No predecessors can optimally fallthrough to BB.
3428 // So we can change one duplication into fallthrough.
3429 if (!Fallthrough) {
3430 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3431 Candidates[0] = Candidates.back();
3432 Candidates.pop_back();
3433 }
3434 }
3435}
3436
3437void MachineBlockPlacement::initTailDupThreshold() {
3438 DupThreshold = BlockFrequency(0);
3439 if (F->getFunction().hasProfileData()) {
3440 // We prefer to use prifile count.
3441 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3442 if (HotThreshold != UINT64_MAX) {
3443 UseProfileCount = true;
3444 DupThreshold =
3445 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3446 } else {
3447 // Profile count is not available, we can use block frequency instead.
3448 BlockFrequency MaxFreq = BlockFrequency(0);
3449 for (MachineBasicBlock &MBB : *F) {
3450 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3451 if (Freq > MaxFreq)
3452 MaxFreq = Freq;
3453 }
3454
3455 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3456 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3457 UseProfileCount = false;
3458 }
3459 }
3460
3461 TailDupSize = TailDupPlacementThreshold;
3462 // If only the aggressive threshold is explicitly set, use it.
3463 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3464 TailDupPlacementThreshold.getNumOccurrences() == 0)
3466
3467 // For aggressive optimization, we can adjust some thresholds to be less
3468 // conservative.
3469 if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
3470 // At O3 we should be more willing to copy blocks for tail duplication. This
3471 // increases size pressure, so we only do it at O3
3472 // Do this unless only the regular threshold is explicitly set.
3473 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3474 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3476 }
3477
3478 // If there's no threshold provided through options, query the target
3479 // information for a threshold instead.
3480 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3481 (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
3482 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3483 TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3484}
3485
3486bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3487 if (skipFunction(MF.getFunction()))
3488 return false;
3489
3490 // Check for single-block functions and skip them.
3491 if (std::next(MF.begin()) == MF.end())
3492 return false;
3493
3494 F = &MF;
3495 MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3496 MBFI = std::make_unique<MBFIWrapper>(
3497 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
3498 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3499 TII = MF.getSubtarget().getInstrInfo();
3500 TLI = MF.getSubtarget().getTargetLowering();
3501 MPDT = nullptr;
3502 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3503 PassConfig = &getAnalysis<TargetPassConfig>();
3504
3505 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3506 // there are no MachineLoops.
3507 PreferredLoopExit = nullptr;
3508
3509 assert(BlockToChain.empty() &&
3510 "BlockToChain map should be empty before starting placement.");
3511 assert(ComputedEdges.empty() &&
3512 "Computed Edge map should be empty before starting placement.");
3513
3514 // Initialize tail duplication thresholds.
3515 initTailDupThreshold();
3516
3517 const bool OptForSize =
3518 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3519 // Determine whether to use ext-tsp for perf/size optimization. The method
3520 // is beneficial only for instances with at least 3 basic blocks and it can be
3521 // disabled for huge functions (exceeding a certain size).
3522 bool UseExtTspForPerf = false;
3523 bool UseExtTspForSize = false;
3524 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3525 UseExtTspForPerf =
3528 UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3529 }
3530
3531 // Apply tail duplication.
3532 if (allowTailDupPlacement()) {
3533 MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3534 if (OptForSize)
3535 TailDupSize = 1;
3536 const bool PreRegAlloc = false;
3537 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3538 /* LayoutMode */ true, TailDupSize);
3539 if (!UseExtTspForSize)
3540 precomputeTriangleChains();
3541 }
3542
3543 // Run the main block placement.
3544 if (!UseExtTspForSize)
3545 buildCFGChains();
3546
3547 // Changing the layout can create new tail merging opportunities.
3548 // TailMerge can create jump into if branches that make CFG irreducible for
3549 // HW that requires structured CFG.
3550 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3551 PassConfig->getEnableTailMerge() &&
3552 BranchFoldPlacement && MF.size() > 3;
3553 // No tail merging opportunities if the block number is less than four.
3554 if (EnableTailMerge) {
3555 const unsigned TailMergeSize = TailDupSize + 1;
3556 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3557 *MBFI, *MBPI, PSI, TailMergeSize);
3558
3559 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3560 /*AfterPlacement=*/true)) {
3561 // Must redo the post-dominator tree if blocks were changed.
3562 if (MPDT)
3563 MPDT->recalculate(MF);
3564 if (!UseExtTspForSize) {
3565 // Redo the layout if tail merging creates/removes/moves blocks.
3566 BlockToChain.clear();
3567 ComputedEdges.clear();
3568 ChainAllocator.DestroyAll();
3569 buildCFGChains();
3570 }
3571 }
3572 }
3573
3574 // Apply a post-processing optimizing block placement:
3575 // - find a new placement and modify the layout of the blocks in the function;
3576 // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3577 if (UseExtTspForPerf || UseExtTspForSize) {
3578 assert(
3579 !(UseExtTspForPerf && UseExtTspForSize) &&
3580 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3581 applyExtTsp(/*OptForSize=*/UseExtTspForSize);
3582 createCFGChainExtTsp();
3583 }
3584
3585 optimizeBranches();
3586 alignBlocks();
3587
3588 BlockToChain.clear();
3589 ComputedEdges.clear();
3590 ChainAllocator.DestroyAll();
3591
3592 // View the function.
3594 (ViewBlockFreqFuncName.empty() ||
3595 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3597 MF.RenumberBlocks();
3598 MBFI->view("MBP." + MF.getName(), false);
3599 }
3600
3601 // We always return true as we have no way to track whether the final order
3602 // differs from the original order.
3603 return true;
3604}
3605
3606void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3607 // Prepare data; blocks are indexed by their index in the current ordering.
3609 BlockIndex.reserve(F->size());
3610 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3611 CurrentBlockOrder.reserve(F->size());
3612 size_t NumBlocks = 0;
3613 for (const MachineBasicBlock &MBB : *F) {
3614 BlockIndex[&MBB] = NumBlocks++;
3615 CurrentBlockOrder.push_back(&MBB);
3616 }
3617
3618 SmallVector<uint64_t, 0> BlockCounts(F->size());
3619 SmallVector<uint64_t, 0> BlockSizes(F->size());
3621 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3623 for (MachineBasicBlock &MBB : *F) {
3624 // Getting the block frequency.
3625 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3626 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
3627 // Getting the block size:
3628 // - approximate the size of an instruction by 4 bytes, and
3629 // - ignore debug instructions.
3630 // Note: getting the exact size of each block is target-dependent and can be
3631 // done by extending the interface of MCCodeEmitter. Experimentally we do
3632 // not see a perf improvement with the exact block sizes.
3633 auto NonDbgInsts =
3635 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3636 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3637
3638 // Getting jump frequencies.
3639 if (OptForSize) {
3640 Cond.clear();
3641 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3642 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3643 continue;
3644
3645 const MachineBasicBlock *FTB = MBB.getFallThrough();
3646 // Succs is a collection of distinct destinations of the block reachable
3647 // from MBB via a jump instruction; initialize the list using the three
3648 // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3649 Succs.clear();
3650 if (TBB && TBB != FTB)
3651 Succs.push_back(TBB);
3652 if (FBB && FBB != FTB)
3653 Succs.push_back(FBB);
3654 if (FTB)
3655 Succs.push_back(FTB);
3656 // Absolute magnitude of non-zero counts does not matter for the
3657 // optimization; prioritize slightly jumps with a single successor, since
3658 // the corresponding jump instruction will be removed from the binary.
3659 const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3660 for (const MachineBasicBlock *Succ : Succs)
3661 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3662 } else {
3663 for (MachineBasicBlock *Succ : MBB.successors()) {
3664 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3665 BlockFrequency JumpFreq = BlockFreq * EP;
3666 JumpCounts.push_back(
3667 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3668 }
3669 }
3670 }
3671
3672 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3673 << " with profile = " << F->getFunction().hasProfileData()
3674 << " (" << F->getName() << ")" << "\n");
3675
3676 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3677 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3678
3679 // Run the layout algorithm.
3680 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3681 std::vector<const MachineBasicBlock *> NewBlockOrder;
3682 NewBlockOrder.reserve(F->size());
3683 for (uint64_t Node : NewOrder) {
3684 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3685 }
3686 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3687 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3688
3689 // If the optimization is unsuccessful, fall back to the original block order.
3690 if (OptForSize && OrgScore > OptScore)
3691 assignBlockOrder(CurrentBlockOrder);
3692 else
3693 assignBlockOrder(NewBlockOrder);
3694}
3695
3696void MachineBlockPlacement::assignBlockOrder(
3697 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3698 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3699 F->RenumberBlocks();
3700 // At this point, we possibly removed blocks from the function, so we can't
3701 // renumber the domtree. At this point, we don't need it anymore, though.
3702 // TODO: move this to the point where the dominator tree is actually
3703 // invalidated (i.e., where blocks are removed without updating the domtree).
3704 MPDT = nullptr;
3705
3706 bool HasChanges = false;
3707 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3708 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3709 HasChanges = true;
3710 break;
3711 }
3712 }
3713 // Stop early if the new block order is identical to the existing one.
3714 if (!HasChanges)
3715 return;
3716
3717 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3718 for (auto &MBB : *F) {
3719 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3720 }
3721
3722 // Sort basic blocks in the function according to the computed order.
3724 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3725 NewIndex[MBB] = NewIndex.size();
3726 }
3727 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3728 return NewIndex[&L] < NewIndex[&R];
3729 });
3730
3731 // Update basic block branches by inserting explicit fallthrough branches
3732 // when required and re-optimize branches when possible.
3733 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3735 for (auto &MBB : *F) {
3736 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3738 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3739 // If this block had a fallthrough before we need an explicit unconditional
3740 // branch to that block if the fallthrough block is not adjacent to the
3741 // block in the new order.
3742 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3743 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3744 }
3745
3746 // It might be possible to optimize branches by flipping the condition.
3747 Cond.clear();
3748 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3749 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3750 continue;
3751 MBB.updateTerminator(FTMBB);
3752 }
3753}
3754
3755void MachineBlockPlacement::createCFGChainExtTsp() {
3756 BlockToChain.clear();
3757 ComputedEdges.clear();
3758 ChainAllocator.DestroyAll();
3759
3760 MachineBasicBlock *HeadBB = &F->front();
3761 BlockChain *FunctionChain =
3762 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3763
3764 for (MachineBasicBlock &MBB : *F) {
3765 if (HeadBB == &MBB)
3766 continue; // Ignore head of the chain
3767 FunctionChain->merge(&MBB, nullptr);
3768 }
3769}
3770
3771namespace {
3772
3773/// A pass to compute block placement statistics.
3774///
3775/// A separate pass to compute interesting statistics for evaluating block
3776/// placement. This is separate from the actual placement pass so that they can
3777/// be computed in the absence of any placement transformations or when using
3778/// alternative placement strategies.
3779class MachineBlockPlacementStats : public MachineFunctionPass {
3780 /// A handle to the branch probability pass.
3781 const MachineBranchProbabilityInfo *MBPI;
3782
3783 /// A handle to the function-wide block frequency pass.
3784 const MachineBlockFrequencyInfo *MBFI;
3785
3786public:
3787 static char ID; // Pass identification, replacement for typeid
3788
3789 MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3791 }
3792
3793 bool runOnMachineFunction(MachineFunction &F) override;
3794
3795 void getAnalysisUsage(AnalysisUsage &AU) const override {
3798 AU.setPreservesAll();
3800 }
3801};
3802
3803} // end anonymous namespace
3804
3805char MachineBlockPlacementStats::ID = 0;
3806
3807char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
3808
3809INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3810 "Basic Block Placement Stats", false, false)
3813INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3815
3816bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3817 // Check for single-block functions and skip them.
3818 if (std::next(F.begin()) == F.end())
3819 return false;
3820
3821 if (!isFunctionInPrintList(F.getName()))
3822 return false;
3823
3824 MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3825 MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3826
3827 for (MachineBasicBlock &MBB : F) {
3828 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3829 Statistic &NumBranches =
3830 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3831 Statistic &BranchTakenFreq =
3832 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3833 for (MachineBasicBlock *Succ : MBB.successors()) {
3834 // Skip if this successor is a fallthrough.
3835 if (MBB.isLayoutSuccessor(Succ))
3836 continue;
3837
3838 BlockFrequency EdgeFreq =
3839 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3840 ++NumBranches;
3841 BranchTakenFreq += EdgeFreq.getFrequency();
3842 }
3843 }
3844
3845 return false;
3846}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
static unsigned InstrCount
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
block placement Basic Block Placement Stats
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
Branch Probability Basic Block Placement
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static cl::opt< bool > ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement."))
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
unify loop Fixup each natural loop to have a single exit block
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:329
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:891
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:616
unsigned pred_size() const
bool isEHPad() const
Returns true if the block is a landing pad.
instr_iterator instr_begin()
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
unsigned succ_size() const
void setAlignment(Align A)
Set alignment of the basic block.
bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:578
typename SuperClass::iterator iterator
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:389
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:439
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:410
Utility class to perform tail duplication.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
TargetInstrInfo - Interface to description of machine instruction set.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
Target-Independent Code Generator Pass Configuration Options.
bool getEnableTailMerge() const
CodeGenOptLevel getOptLevel() const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:132
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define UINT64_MAX
Definition: DataTypes.h:77
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:456
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
cl::opt< bool > ApplyExtTspWithoutProfile
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
cl::opt< unsigned > StaticLikelyProb
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2099
Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void initializeMachineBlockPlacementPass(PassRegistry &)
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39