LLVM 20.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
27#include "BranchFolding.h"
28#include "llvm/ADT/ArrayRef.h"
29#include "llvm/ADT/DenseMap.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/SetVector.h"
34#include "llvm/ADT/Statistic.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/IR/Function.h"
53#include "llvm/IR/PrintPasses.h"
55#include "llvm/Pass.h"
62#include "llvm/Support/Debug.h"
66#include <algorithm>
67#include <cassert>
68#include <cstdint>
69#include <iterator>
70#include <memory>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "block-placement"
79
80STATISTIC(NumCondBranches, "Number of conditional branches");
81STATISTIC(NumUncondBranches, "Number of unconditional branches");
82STATISTIC(CondBranchTakenFreq,
83 "Potential frequency of taking conditional branches");
84STATISTIC(UncondBranchTakenFreq,
85 "Potential frequency of taking unconditional branches");
86
88 "align-all-blocks",
89 cl::desc("Force the alignment of all blocks in the function in log2 format "
90 "(e.g 4 means align on 16B boundaries)."),
92
94 "align-all-nofallthru-blocks",
95 cl::desc("Force the alignment of all blocks that have no fall-through "
96 "predecessors (i.e. don't add nops that are executed). In log2 "
97 "format (e.g 4 means align on 16B boundaries)."),
99
101 "max-bytes-for-alignment",
102 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
103 "alignment"),
104 cl::init(0), cl::Hidden);
105
106// FIXME: Find a good default for this flag and remove the flag.
108 "block-placement-exit-block-bias",
109 cl::desc("Block frequency percentage a loop exit block needs "
110 "over the original exit to be considered the new exit."),
111 cl::init(0), cl::Hidden);
112
113// Definition:
114// - Outlining: placement of a basic block outside the chain or hot path.
115
117 "loop-to-cold-block-ratio",
118 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
119 "(frequency of block) is greater than this ratio"),
120 cl::init(5), cl::Hidden);
121
123 "force-loop-cold-block",
124 cl::desc("Force outlining cold blocks from loops."),
125 cl::init(false), cl::Hidden);
126
127static cl::opt<bool>
128 PreciseRotationCost("precise-rotation-cost",
129 cl::desc("Model the cost of loop rotation more "
130 "precisely by using profile data."),
131 cl::init(false), cl::Hidden);
132
133static cl::opt<bool>
134 ForcePreciseRotationCost("force-precise-rotation-cost",
135 cl::desc("Force the use of precise cost "
136 "loop rotation strategy."),
137 cl::init(false), cl::Hidden);
138
140 "misfetch-cost",
141 cl::desc("Cost that models the probabilistic risk of an instruction "
142 "misfetch due to a jump comparing to falling through, whose cost "
143 "is zero."),
144 cl::init(1), cl::Hidden);
145
146static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
147 cl::desc("Cost of jump instructions."),
148 cl::init(1), cl::Hidden);
149static cl::opt<bool>
150TailDupPlacement("tail-dup-placement",
151 cl::desc("Perform tail duplication during placement. "
152 "Creates more fallthrough opportunites in "
153 "outline branches."),
154 cl::init(true), cl::Hidden);
155
156static cl::opt<bool>
157BranchFoldPlacement("branch-fold-placement",
158 cl::desc("Perform branch folding during placement. "
159 "Reduces code size."),
160 cl::init(true), cl::Hidden);
161
162// Heuristic for tail duplication.
164 "tail-dup-placement-threshold",
165 cl::desc("Instruction cutoff for tail duplication during layout. "
166 "Tail merging during layout is forced to have a threshold "
167 "that won't conflict."), cl::init(2),
168 cl::Hidden);
169
170// Heuristic for aggressive tail duplication.
172 "tail-dup-placement-aggressive-threshold",
173 cl::desc("Instruction cutoff for aggressive tail duplication during "
174 "layout. Used at -O3. Tail merging during layout is forced to "
175 "have a threshold that won't conflict."), cl::init(4),
176 cl::Hidden);
177
178// Heuristic for tail duplication.
180 "tail-dup-placement-penalty",
181 cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
182 "Copying can increase fallthrough, but it also increases icache "
183 "pressure. This parameter controls the penalty to account for that. "
184 "Percent as integer."),
185 cl::init(2),
186 cl::Hidden);
187
188// Heuristic for tail duplication if profile count is used in cost model.
190 "tail-dup-profile-percent-threshold",
191 cl::desc("If profile count information is used in tail duplication cost "
192 "model, the gained fall through number from tail duplication "
193 "should be at least this percent of hot count."),
194 cl::init(50), cl::Hidden);
195
196// Heuristic for triangle chains.
198 "triangle-chain-count",
199 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
200 "triangle tail duplication heuristic to kick in. 0 to disable."),
201 cl::init(2),
202 cl::Hidden);
203
204// Use case: When block layout is visualized after MBP pass, the basic blocks
205// are labeled in layout order; meanwhile blocks could be numbered in a
206// different order. It's hard to map between the graph and pass output.
207// With this option on, the basic blocks are renumbered in function layout
208// order. For debugging only.
210 "renumber-blocks-before-view",
211 cl::desc(
212 "If true, basic blocks are re-numbered before MBP layout is printed "
213 "into a dot graph. Only used when a function is being printed."),
214 cl::init(false), cl::Hidden);
215
217 "ext-tsp-block-placement-max-blocks",
218 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
219 "block placement."),
220 cl::init(UINT_MAX), cl::Hidden);
221
222namespace llvm {
227
228// Internal option used to control BFI display only after MBP pass.
229// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
230// -view-block-layout-with-bfi=
232
233// Command line option to specify the name of the function for CFG dump
234// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
236} // namespace llvm
237
238namespace {
239
240class BlockChain;
241
242/// Type for our function-wide basic block -> block chain mapping.
243using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
244
245/// A chain of blocks which will be laid out contiguously.
246///
247/// This is the datastructure representing a chain of consecutive blocks that
248/// are profitable to layout together in order to maximize fallthrough
249/// probabilities and code locality. We also can use a block chain to represent
250/// a sequence of basic blocks which have some external (correctness)
251/// requirement for sequential layout.
252///
253/// Chains can be built around a single basic block and can be merged to grow
254/// them. They participate in a block-to-chain mapping, which is updated
255/// automatically as chains are merged together.
256class BlockChain {
257 /// The sequence of blocks belonging to this chain.
258 ///
259 /// This is the sequence of blocks for a particular chain. These will be laid
260 /// out in-order within the function.
262
263 /// A handle to the function-wide basic block to block chain mapping.
264 ///
265 /// This is retained in each block chain to simplify the computation of child
266 /// block chains for SCC-formation and iteration. We store the edges to child
267 /// basic blocks, and map them back to their associated chains using this
268 /// structure.
269 BlockToChainMapType &BlockToChain;
270
271public:
272 /// Construct a new BlockChain.
273 ///
274 /// This builds a new block chain representing a single basic block in the
275 /// function. It also registers itself as the chain that block participates
276 /// in with the BlockToChain mapping.
277 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
278 : Blocks(1, BB), BlockToChain(BlockToChain) {
279 assert(BB && "Cannot create a chain with a null basic block");
280 BlockToChain[BB] = this;
281 }
282
283 /// Iterator over blocks within the chain.
286
287 /// Beginning of blocks within the chain.
288 iterator begin() { return Blocks.begin(); }
289 const_iterator begin() const { return Blocks.begin(); }
290
291 /// End of blocks within the chain.
292 iterator end() { return Blocks.end(); }
293 const_iterator end() const { return Blocks.end(); }
294
295 bool remove(MachineBasicBlock* BB) {
296 for(iterator i = begin(); i != end(); ++i) {
297 if (*i == BB) {
298 Blocks.erase(i);
299 return true;
300 }
301 }
302 return false;
303 }
304
305 /// Merge a block chain into this one.
306 ///
307 /// This routine merges a block chain into this one. It takes care of forming
308 /// a contiguous sequence of basic blocks, updating the edge list, and
309 /// updating the block -> chain mapping. It does not free or tear down the
310 /// old chain, but the old chain's block list is no longer valid.
311 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
312 assert(BB && "Can't merge a null block.");
313 assert(!Blocks.empty() && "Can't merge into an empty chain.");
314
315 // Fast path in case we don't have a chain already.
316 if (!Chain) {
317 assert(!BlockToChain[BB] &&
318 "Passed chain is null, but BB has entry in BlockToChain.");
319 Blocks.push_back(BB);
320 BlockToChain[BB] = this;
321 return;
322 }
323
324 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
325 assert(Chain->begin() != Chain->end());
326
327 // Update the incoming blocks to point to this chain, and add them to the
328 // chain structure.
329 for (MachineBasicBlock *ChainBB : *Chain) {
330 Blocks.push_back(ChainBB);
331 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
332 BlockToChain[ChainBB] = this;
333 }
334 }
335
336#ifndef NDEBUG
337 /// Dump the blocks in this chain.
338 LLVM_DUMP_METHOD void dump() {
339 for (MachineBasicBlock *MBB : *this)
340 MBB->dump();
341 }
342#endif // NDEBUG
343
344 /// Count of predecessors of any block within the chain which have not
345 /// yet been scheduled. In general, we will delay scheduling this chain
346 /// until those predecessors are scheduled (or we find a sufficiently good
347 /// reason to override this heuristic.) Note that when forming loop chains,
348 /// blocks outside the loop are ignored and treated as if they were already
349 /// scheduled.
350 ///
351 /// Note: This field is reinitialized multiple times - once for each loop,
352 /// and then once for the function as a whole.
353 unsigned UnscheduledPredecessors = 0;
354};
355
356class MachineBlockPlacement : public MachineFunctionPass {
357 /// A type for a block filter set.
359
360 /// Pair struct containing basic block and taildup profitability
361 struct BlockAndTailDupResult {
362 MachineBasicBlock *BB = nullptr;
363 bool ShouldTailDup;
364 };
365
366 /// Triple struct containing edge weight and the edge.
367 struct WeightedEdge {
368 BlockFrequency Weight;
369 MachineBasicBlock *Src = nullptr;
370 MachineBasicBlock *Dest = nullptr;
371 };
372
373 /// work lists of blocks that are ready to be laid out
376
377 /// Edges that have already been computed as optimal.
379
380 /// Machine Function
381 MachineFunction *F = nullptr;
382
383 /// A handle to the branch probability pass.
384 const MachineBranchProbabilityInfo *MBPI = nullptr;
385
386 /// A handle to the function-wide block frequency pass.
387 std::unique_ptr<MBFIWrapper> MBFI;
388
389 /// A handle to the loop info.
390 MachineLoopInfo *MLI = nullptr;
391
392 /// Preferred loop exit.
393 /// Member variable for convenience. It may be removed by duplication deep
394 /// in the call stack.
395 MachineBasicBlock *PreferredLoopExit = nullptr;
396
397 /// A handle to the target's instruction info.
398 const TargetInstrInfo *TII = nullptr;
399
400 /// A handle to the target's lowering info.
401 const TargetLoweringBase *TLI = nullptr;
402
403 /// A handle to the post dominator tree.
404 MachinePostDominatorTree *MPDT = nullptr;
405
406 ProfileSummaryInfo *PSI = nullptr;
407
408 /// Duplicator used to duplicate tails during placement.
409 ///
410 /// Placement decisions can open up new tail duplication opportunities, but
411 /// since tail duplication affects placement decisions of later blocks, it
412 /// must be done inline.
413 TailDuplicator TailDup;
414
415 /// Partial tail duplication threshold.
416 BlockFrequency DupThreshold;
417
418 /// True: use block profile count to compute tail duplication cost.
419 /// False: use block frequency to compute tail duplication cost.
420 bool UseProfileCount = false;
421
422 /// Allocator and owner of BlockChain structures.
423 ///
424 /// We build BlockChains lazily while processing the loop structure of
425 /// a function. To reduce malloc traffic, we allocate them using this
426 /// slab-like allocator, and destroy them after the pass completes. An
427 /// important guarantee is that this allocator produces stable pointers to
428 /// the chains.
430
431 /// Function wide BasicBlock to BlockChain mapping.
432 ///
433 /// This mapping allows efficiently moving from any given basic block to the
434 /// BlockChain it participates in, if any. We use it to, among other things,
435 /// allow implicitly defining edges between chains as the existing edges
436 /// between basic blocks.
438
439#ifndef NDEBUG
440 /// The set of basic blocks that have terminators that cannot be fully
441 /// analyzed. These basic blocks cannot be re-ordered safely by
442 /// MachineBlockPlacement, and we must preserve physical layout of these
443 /// blocks and their successors through the pass.
444 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
445#endif
446
447 /// Get block profile count or frequency according to UseProfileCount.
448 /// The return value is used to model tail duplication cost.
449 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
450 if (UseProfileCount) {
451 auto Count = MBFI->getBlockProfileCount(BB);
452 if (Count)
453 return BlockFrequency(*Count);
454 else
455 return BlockFrequency(0);
456 } else
457 return MBFI->getBlockFreq(BB);
458 }
459
460 /// Scale the DupThreshold according to basic block size.
461 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
462 void initDupThreshold();
463
464 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
465 /// if the count goes to 0, add them to the appropriate work list.
466 void markChainSuccessors(
467 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
468 const BlockFilterSet *BlockFilter = nullptr);
469
470 /// Decrease the UnscheduledPredecessors count for a single block, and
471 /// if the count goes to 0, add them to the appropriate work list.
472 void markBlockSuccessors(
473 const BlockChain &Chain, const MachineBasicBlock *BB,
474 const MachineBasicBlock *LoopHeaderBB,
475 const BlockFilterSet *BlockFilter = nullptr);
476
478 collectViableSuccessors(
479 const MachineBasicBlock *BB, const BlockChain &Chain,
480 const BlockFilterSet *BlockFilter,
482 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
483 BlockFilterSet *BlockFilter);
484 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
486 BlockFilterSet *BlockFilter);
487 bool repeatedlyTailDuplicateBlock(
489 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
490 BlockFilterSet *BlockFilter,
491 MachineFunction::iterator &PrevUnplacedBlockIt,
492 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
493 bool
494 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
495 BlockChain &Chain, BlockFilterSet *BlockFilter,
496 MachineFunction::iterator &PrevUnplacedBlockIt,
497 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
498 bool &DuplicatedToLPred);
499 bool hasBetterLayoutPredecessor(
500 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
501 const BlockChain &SuccChain, BranchProbability SuccProb,
502 BranchProbability RealSuccProb, const BlockChain &Chain,
503 const BlockFilterSet *BlockFilter);
504 BlockAndTailDupResult selectBestSuccessor(
505 const MachineBasicBlock *BB, const BlockChain &Chain,
506 const BlockFilterSet *BlockFilter);
507 MachineBasicBlock *selectBestCandidateBlock(
508 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
510 getFirstUnplacedBlock(const BlockChain &PlacedChain,
511 MachineFunction::iterator &PrevUnplacedBlockIt);
513 getFirstUnplacedBlock(const BlockChain &PlacedChain,
514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515 const BlockFilterSet *BlockFilter);
516
517 /// Add a basic block to the work list if it is appropriate.
518 ///
519 /// If the optional parameter BlockFilter is provided, only MBB
520 /// present in the set will be added to the worklist. If nullptr
521 /// is provided, no filtering occurs.
522 void fillWorkLists(const MachineBasicBlock *MBB,
523 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
524 const BlockFilterSet *BlockFilter);
525
526 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
527 BlockFilterSet *BlockFilter = nullptr);
528 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
529 const MachineBasicBlock *OldTop);
530 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
531 const BlockFilterSet &LoopBlockSet);
532 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
533 const BlockFilterSet &LoopBlockSet);
534 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
535 const MachineBasicBlock *OldTop,
536 const MachineBasicBlock *ExitBB,
537 const BlockFilterSet &LoopBlockSet);
538 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
539 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
540 MachineBasicBlock *findBestLoopTop(
541 const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
542 MachineBasicBlock *findBestLoopExit(
543 const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
544 BlockFrequency &ExitFreq);
545 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
546 void buildLoopChains(const MachineLoop &L);
547 void rotateLoop(
548 BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
549 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
550 void rotateLoopWithProfile(
551 BlockChain &LoopChain, const MachineLoop &L,
552 const BlockFilterSet &LoopBlockSet);
553 void buildCFGChains();
554 void optimizeBranches();
555 void alignBlocks();
556 /// Returns true if a block should be tail-duplicated to increase fallthrough
557 /// opportunities.
558 bool shouldTailDuplicate(MachineBasicBlock *BB);
559 /// Check the edge frequencies to see if tail duplication will increase
560 /// fallthroughs.
561 bool isProfitableToTailDup(
562 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
563 BranchProbability QProb,
564 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
565
566 /// Check for a trellis layout.
567 bool isTrellis(const MachineBasicBlock *BB,
568 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
569 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
570
571 /// Get the best successor given a trellis layout.
572 BlockAndTailDupResult getBestTrellisSuccessor(
573 const MachineBasicBlock *BB,
574 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
575 BranchProbability AdjustedSumProb, const BlockChain &Chain,
576 const BlockFilterSet *BlockFilter);
577
578 /// Get the best pair of non-conflicting edges.
579 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
580 const MachineBasicBlock *BB,
582
583 /// Returns true if a block can tail duplicate into all unplaced
584 /// predecessors. Filters based on loop.
585 bool canTailDuplicateUnplacedPreds(
586 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
587 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
588
589 /// Find chains of triangles to tail-duplicate where a global analysis works,
590 /// but a local analysis would not find them.
591 void precomputeTriangleChains();
592
593 /// Apply a post-processing step optimizing block placement.
594 void applyExtTsp();
595
596 /// Modify the existing block placement in the function and adjust all jumps.
597 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
598
599 /// Create a single CFG chain from the current block order.
600 void createCFGChainExtTsp();
601
602public:
603 static char ID; // Pass identification, replacement for typeid
604
605 MachineBlockPlacement() : MachineFunctionPass(ID) {
607 }
608
609 bool runOnMachineFunction(MachineFunction &F) override;
610
611 bool allowTailDupPlacement() const {
612 assert(F);
613 return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
614 }
615
616 void getAnalysisUsage(AnalysisUsage &AU) const override {
625 }
626};
627
628} // end anonymous namespace
629
630char MachineBlockPlacement::ID = 0;
631
632char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
633
634INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
635 "Branch Probability Basic Block Placement", false, false)
641INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
642 "Branch Probability Basic Block Placement", false, false)
643
644#ifndef NDEBUG
645/// Helper to print the name of a MBB.
646///
647/// Only used by debug logging.
648static std::string getBlockName(const MachineBasicBlock *BB) {
649 std::string Result;
650 raw_string_ostream OS(Result);
651 OS << printMBBReference(*BB);
652 OS << " ('" << BB->getName() << "')";
653 OS.flush();
654 return Result;
655}
656#endif
657
658/// Mark a chain's successors as having one fewer preds.
659///
660/// When a chain is being merged into the "placed" chain, this routine will
661/// quickly walk the successors of each block in the chain and mark them as
662/// having one fewer active predecessor. It also adds any successors of this
663/// chain which reach the zero-predecessor state to the appropriate worklist.
664void MachineBlockPlacement::markChainSuccessors(
665 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
666 const BlockFilterSet *BlockFilter) {
667 // Walk all the blocks in this chain, marking their successors as having
668 // a predecessor placed.
669 for (MachineBasicBlock *MBB : Chain) {
670 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
671 }
672}
673
674/// Mark a single block's successors as having one fewer preds.
675///
676/// Under normal circumstances, this is only called by markChainSuccessors,
677/// but if a block that was to be placed is completely tail-duplicated away,
678/// and was duplicated into the chain end, we need to redo markBlockSuccessors
679/// for just that block.
680void MachineBlockPlacement::markBlockSuccessors(
681 const BlockChain &Chain, const MachineBasicBlock *MBB,
682 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
683 // Add any successors for which this is the only un-placed in-loop
684 // predecessor to the worklist as a viable candidate for CFG-neutral
685 // placement. No subsequent placement of this block will violate the CFG
686 // shape, so we get to use heuristics to choose a favorable placement.
687 for (MachineBasicBlock *Succ : MBB->successors()) {
688 if (BlockFilter && !BlockFilter->count(Succ))
689 continue;
690 BlockChain &SuccChain = *BlockToChain[Succ];
691 // Disregard edges within a fixed chain, or edges to the loop header.
692 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
693 continue;
694
695 // This is a cross-chain edge that is within the loop, so decrement the
696 // loop predecessor count of the destination chain.
697 if (SuccChain.UnscheduledPredecessors == 0 ||
698 --SuccChain.UnscheduledPredecessors > 0)
699 continue;
700
701 auto *NewBB = *SuccChain.begin();
702 if (NewBB->isEHPad())
703 EHPadWorkList.push_back(NewBB);
704 else
705 BlockWorkList.push_back(NewBB);
706 }
707}
708
709/// This helper function collects the set of successors of block
710/// \p BB that are allowed to be its layout successors, and return
711/// the total branch probability of edges from \p BB to those
712/// blocks.
713BranchProbability MachineBlockPlacement::collectViableSuccessors(
714 const MachineBasicBlock *BB, const BlockChain &Chain,
715 const BlockFilterSet *BlockFilter,
717 // Adjust edge probabilities by excluding edges pointing to blocks that is
718 // either not in BlockFilter or is already in the current chain. Consider the
719 // following CFG:
720 //
721 // --->A
722 // | / \
723 // | B C
724 // | \ / \
725 // ----D E
726 //
727 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
728 // A->C is chosen as a fall-through, D won't be selected as a successor of C
729 // due to CFG constraint (the probability of C->D is not greater than
730 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
731 // when calculating the probability of C->D, D will be selected and we
732 // will get A C D B as the layout of this loop.
733 auto AdjustedSumProb = BranchProbability::getOne();
734 for (MachineBasicBlock *Succ : BB->successors()) {
735 bool SkipSucc = false;
736 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
737 SkipSucc = true;
738 } else {
739 BlockChain *SuccChain = BlockToChain[Succ];
740 if (SuccChain == &Chain) {
741 SkipSucc = true;
742 } else if (Succ != *SuccChain->begin()) {
743 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
744 << " -> Mid chain!\n");
745 continue;
746 }
747 }
748 if (SkipSucc)
749 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
750 else
751 Successors.push_back(Succ);
752 }
753
754 return AdjustedSumProb;
755}
756
757/// The helper function returns the branch probability that is adjusted
758/// or normalized over the new total \p AdjustedSumProb.
761 BranchProbability AdjustedSumProb) {
762 BranchProbability SuccProb;
763 uint32_t SuccProbN = OrigProb.getNumerator();
764 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
765 if (SuccProbN >= SuccProbD)
766 SuccProb = BranchProbability::getOne();
767 else
768 SuccProb = BranchProbability(SuccProbN, SuccProbD);
769
770 return SuccProb;
771}
772
773/// Check if \p BB has exactly the successors in \p Successors.
774static bool
777 if (BB.succ_size() != Successors.size())
778 return false;
779 // We don't want to count self-loops
780 if (Successors.count(&BB))
781 return false;
782 for (MachineBasicBlock *Succ : BB.successors())
783 if (!Successors.count(Succ))
784 return false;
785 return true;
786}
787
788/// Check if a block should be tail duplicated to increase fallthrough
789/// opportunities.
790/// \p BB Block to check.
791bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
792 // Blocks with single successors don't create additional fallthrough
793 // opportunities. Don't duplicate them. TODO: When conditional exits are
794 // analyzable, allow them to be duplicated.
795 bool IsSimple = TailDup.isSimpleBB(BB);
796
797 if (BB->succ_size() == 1)
798 return false;
799 return TailDup.shouldTailDuplicate(IsSimple, *BB);
800}
801
802/// Compare 2 BlockFrequency's with a small penalty for \p A.
803/// In order to be conservative, we apply a X% penalty to account for
804/// increased icache pressure and static heuristics. For small frequencies
805/// we use only the numerators to improve accuracy. For simplicity, we assume the
806/// penalty is less than 100%
807/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
809 BlockFrequency EntryFreq) {
810 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
811 BlockFrequency Gain = A - B;
812 return (Gain / ThresholdProb) >= EntryFreq;
813}
814
815/// Check the edge frequencies to see if tail duplication will increase
816/// fallthroughs. It only makes sense to call this function when
817/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
818/// always locally profitable if we would have picked \p Succ without
819/// considering duplication.
820bool MachineBlockPlacement::isProfitableToTailDup(
821 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
822 BranchProbability QProb,
823 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
824 // We need to do a probability calculation to make sure this is profitable.
825 // First: does succ have a successor that post-dominates? This affects the
826 // calculation. The 2 relevant cases are:
827 // BB BB
828 // | \Qout | \Qout
829 // P| C |P C
830 // = C' = C'
831 // | /Qin | /Qin
832 // | / | /
833 // Succ Succ
834 // / \ | \ V
835 // U/ =V |U \
836 // / \ = D
837 // D E | /
838 // | /
839 // |/
840 // PDom
841 // '=' : Branch taken for that CFG edge
842 // In the second case, Placing Succ while duplicating it into C prevents the
843 // fallthrough of Succ into either D or PDom, because they now have C as an
844 // unplaced predecessor
845
846 // Start by figuring out which case we fall into
847 MachineBasicBlock *PDom = nullptr;
849 // Only scan the relevant successors
850 auto AdjustedSuccSumProb =
851 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
852 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
853 auto BBFreq = MBFI->getBlockFreq(BB);
854 auto SuccFreq = MBFI->getBlockFreq(Succ);
855 BlockFrequency P = BBFreq * PProb;
856 BlockFrequency Qout = BBFreq * QProb;
857 BlockFrequency EntryFreq = MBFI->getEntryFreq();
858 // If there are no more successors, it is profitable to copy, as it strictly
859 // increases fallthrough.
860 if (SuccSuccs.size() == 0)
861 return greaterWithBias(P, Qout, EntryFreq);
862
863 auto BestSuccSucc = BranchProbability::getZero();
864 // Find the PDom or the best Succ if no PDom exists.
865 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
866 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
867 if (Prob > BestSuccSucc)
868 BestSuccSucc = Prob;
869 if (PDom == nullptr)
870 if (MPDT->dominates(SuccSucc, Succ)) {
871 PDom = SuccSucc;
872 break;
873 }
874 }
875 // For the comparisons, we need to know Succ's best incoming edge that isn't
876 // from BB.
877 auto SuccBestPred = BlockFrequency(0);
878 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
879 if (SuccPred == Succ || SuccPred == BB
880 || BlockToChain[SuccPred] == &Chain
881 || (BlockFilter && !BlockFilter->count(SuccPred)))
882 continue;
883 auto Freq = MBFI->getBlockFreq(SuccPred)
884 * MBPI->getEdgeProbability(SuccPred, Succ);
885 if (Freq > SuccBestPred)
886 SuccBestPred = Freq;
887 }
888 // Qin is Succ's best unplaced incoming edge that isn't BB
889 BlockFrequency Qin = SuccBestPred;
890 // If it doesn't have a post-dominating successor, here is the calculation:
891 // BB BB
892 // | \Qout | \
893 // P| C | =
894 // = C' | C
895 // | /Qin | |
896 // | / | C' (+Succ)
897 // Succ Succ /|
898 // / \ | \/ |
899 // U/ =V | == |
900 // / \ | / \|
901 // D E D E
902 // '=' : Branch taken for that CFG edge
903 // Cost in the first case is: P + V
904 // For this calculation, we always assume P > Qout. If Qout > P
905 // The result of this function will be ignored at the caller.
906 // Let F = SuccFreq - Qin
907 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
908
909 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
910 BranchProbability UProb = BestSuccSucc;
911 BranchProbability VProb = AdjustedSuccSumProb - UProb;
912 BlockFrequency F = SuccFreq - Qin;
913 BlockFrequency V = SuccFreq * VProb;
914 BlockFrequency QinU = std::min(Qin, F) * UProb;
915 BlockFrequency BaseCost = P + V;
916 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
917 return greaterWithBias(BaseCost, DupCost, EntryFreq);
918 }
919 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
920 BranchProbability VProb = AdjustedSuccSumProb - UProb;
921 BlockFrequency U = SuccFreq * UProb;
922 BlockFrequency V = SuccFreq * VProb;
923 BlockFrequency F = SuccFreq - Qin;
924 // If there is a post-dominating successor, here is the calculation:
925 // BB BB BB BB
926 // | \Qout | \ | \Qout | \
927 // |P C | = |P C | =
928 // = C' |P C = C' |P C
929 // | /Qin | | | /Qin | |
930 // | / | C' (+Succ) | / | C' (+Succ)
931 // Succ Succ /| Succ Succ /|
932 // | \ V | \/ | | \ V | \/ |
933 // |U \ |U /\ =? |U = |U /\ |
934 // = D = = =?| | D | = =|
935 // | / |/ D | / |/ D
936 // | / | / | = | /
937 // |/ | / |/ | =
938 // Dom Dom Dom Dom
939 // '=' : Branch taken for that CFG edge
940 // The cost for taken branches in the first case is P + U
941 // Let F = SuccFreq - Qin
942 // The cost in the second case (assuming independence), given the layout:
943 // BB, Succ, (C+Succ), D, Dom or the layout:
944 // BB, Succ, D, Dom, (C+Succ)
945 // is Qout + max(F, Qin) * U + min(F, Qin)
946 // compare P + U vs Qout + P * U + Qin.
947 //
948 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
949 //
950 // For the 3rd case, the cost is P + 2 * V
951 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
952 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
953 if (UProb > AdjustedSuccSumProb / 2 &&
954 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
955 Chain, BlockFilter))
956 // Cases 3 & 4
957 return greaterWithBias(
958 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
959 EntryFreq);
960 // Cases 1 & 2
961 return greaterWithBias((P + U),
962 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
963 std::max(Qin, F) * UProb),
964 EntryFreq);
965}
966
967/// Check for a trellis layout. \p BB is the upper part of a trellis if its
968/// successors form the lower part of a trellis. A successor set S forms the
969/// lower part of a trellis if all of the predecessors of S are either in S or
970/// have all of S as successors. We ignore trellises where BB doesn't have 2
971/// successors because for fewer than 2, it's trivial, and for 3 or greater they
972/// are very uncommon and complex to compute optimally. Allowing edges within S
973/// is not strictly a trellis, but the same algorithm works, so we allow it.
974bool MachineBlockPlacement::isTrellis(
975 const MachineBasicBlock *BB,
976 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
977 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
978 // Technically BB could form a trellis with branching factor higher than 2.
979 // But that's extremely uncommon.
980 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
981 return false;
982
984 BB->succ_end());
985 // To avoid reviewing the same predecessors twice.
987
988 for (MachineBasicBlock *Succ : ViableSuccs) {
989 int PredCount = 0;
990 for (auto *SuccPred : Succ->predecessors()) {
991 // Allow triangle successors, but don't count them.
992 if (Successors.count(SuccPred)) {
993 // Make sure that it is actually a triangle.
994 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
995 if (!Successors.count(CheckSucc))
996 return false;
997 continue;
998 }
999 const BlockChain *PredChain = BlockToChain[SuccPred];
1000 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1001 PredChain == &Chain || PredChain == BlockToChain[Succ])
1002 continue;
1003 ++PredCount;
1004 // Perform the successor check only once.
1005 if (!SeenPreds.insert(SuccPred).second)
1006 continue;
1007 if (!hasSameSuccessors(*SuccPred, Successors))
1008 return false;
1009 }
1010 // If one of the successors has only BB as a predecessor, it is not a
1011 // trellis.
1012 if (PredCount < 1)
1013 return false;
1014 }
1015 return true;
1016}
1017
1018/// Pick the highest total weight pair of edges that can both be laid out.
1019/// The edges in \p Edges[0] are assumed to have a different destination than
1020/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1021/// the individual highest weight edges to the 2 different destinations, or in
1022/// case of a conflict, one of them should be replaced with a 2nd best edge.
1023std::pair<MachineBlockPlacement::WeightedEdge,
1024 MachineBlockPlacement::WeightedEdge>
1025MachineBlockPlacement::getBestNonConflictingEdges(
1026 const MachineBasicBlock *BB,
1028 Edges) {
1029 // Sort the edges, and then for each successor, find the best incoming
1030 // predecessor. If the best incoming predecessors aren't the same,
1031 // then that is clearly the best layout. If there is a conflict, one of the
1032 // successors will have to fallthrough from the second best predecessor. We
1033 // compare which combination is better overall.
1034
1035 // Sort for highest frequency.
1036 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1037
1038 llvm::stable_sort(Edges[0], Cmp);
1039 llvm::stable_sort(Edges[1], Cmp);
1040 auto BestA = Edges[0].begin();
1041 auto BestB = Edges[1].begin();
1042 // Arrange for the correct answer to be in BestA and BestB
1043 // If the 2 best edges don't conflict, the answer is already there.
1044 if (BestA->Src == BestB->Src) {
1045 // Compare the total fallthrough of (Best + Second Best) for both pairs
1046 auto SecondBestA = std::next(BestA);
1047 auto SecondBestB = std::next(BestB);
1048 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1049 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1050 if (BestAScore < BestBScore)
1051 BestA = SecondBestA;
1052 else
1053 BestB = SecondBestB;
1054 }
1055 // Arrange for the BB edge to be in BestA if it exists.
1056 if (BestB->Src == BB)
1057 std::swap(BestA, BestB);
1058 return std::make_pair(*BestA, *BestB);
1059}
1060
1061/// Get the best successor from \p BB based on \p BB being part of a trellis.
1062/// We only handle trellises with 2 successors, so the algorithm is
1063/// straightforward: Find the best pair of edges that don't conflict. We find
1064/// the best incoming edge for each successor in the trellis. If those conflict,
1065/// we consider which of them should be replaced with the second best.
1066/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1067/// comes from \p BB, it will be in \p BestEdges[0]
1068MachineBlockPlacement::BlockAndTailDupResult
1069MachineBlockPlacement::getBestTrellisSuccessor(
1070 const MachineBasicBlock *BB,
1071 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1072 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1073 const BlockFilterSet *BlockFilter) {
1074
1075 BlockAndTailDupResult Result = {nullptr, false};
1077 BB->succ_end());
1078
1079 // We assume size 2 because it's common. For general n, we would have to do
1080 // the Hungarian algorithm, but it's not worth the complexity because more
1081 // than 2 successors is fairly uncommon, and a trellis even more so.
1082 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1083 return Result;
1084
1085 // Collect the edge frequencies of all edges that form the trellis.
1087 int SuccIndex = 0;
1088 for (auto *Succ : ViableSuccs) {
1089 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1090 // Skip any placed predecessors that are not BB
1091 if (SuccPred != BB)
1092 if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
1093 BlockToChain[SuccPred] == &Chain ||
1094 BlockToChain[SuccPred] == BlockToChain[Succ])
1095 continue;
1096 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1097 MBPI->getEdgeProbability(SuccPred, Succ);
1098 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1099 }
1100 ++SuccIndex;
1101 }
1102
1103 // Pick the best combination of 2 edges from all the edges in the trellis.
1104 WeightedEdge BestA, BestB;
1105 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1106
1107 if (BestA.Src != BB) {
1108 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1109 // we shouldn't choose any successor. We've already looked and there's a
1110 // better fallthrough edge for all the successors.
1111 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1112 return Result;
1113 }
1114
1115 // Did we pick the triangle edge? If tail-duplication is profitable, do
1116 // that instead. Otherwise merge the triangle edge now while we know it is
1117 // optimal.
1118 if (BestA.Dest == BestB.Src) {
1119 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1120 // would be better.
1121 MachineBasicBlock *Succ1 = BestA.Dest;
1122 MachineBasicBlock *Succ2 = BestB.Dest;
1123 // Check to see if tail-duplication would be profitable.
1124 if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
1125 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1126 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1127 Chain, BlockFilter)) {
1129 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1130 dbgs() << " Selected: " << getBlockName(Succ2)
1131 << ", probability: " << Succ2Prob
1132 << " (Tail Duplicate)\n");
1133 Result.BB = Succ2;
1134 Result.ShouldTailDup = true;
1135 return Result;
1136 }
1137 }
1138 // We have already computed the optimal edge for the other side of the
1139 // trellis.
1140 ComputedEdges[BestB.Src] = { BestB.Dest, false };
1141
1142 auto TrellisSucc = BestA.Dest;
1144 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1145 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1146 << ", probability: " << SuccProb << " (Trellis)\n");
1147 Result.BB = TrellisSucc;
1148 return Result;
1149}
1150
1151/// When the option allowTailDupPlacement() is on, this method checks if the
1152/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1153/// into all of its unplaced, unfiltered predecessors, that are not BB.
1154bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1155 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1156 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1157 if (!shouldTailDuplicate(Succ))
1158 return false;
1159
1160 // The result of canTailDuplicate.
1161 bool Duplicate = true;
1162 // Number of possible duplication.
1163 unsigned int NumDup = 0;
1164
1165 // For CFG checking.
1167 BB->succ_end());
1168 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1169 // Make sure all unplaced and unfiltered predecessors can be
1170 // tail-duplicated into.
1171 // Skip any blocks that are already placed or not in this loop.
1172 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
1173 || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1174 continue;
1175 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1176 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1177 // This will result in a trellis after tail duplication, so we don't
1178 // need to copy Succ into this predecessor. In the presence
1179 // of a trellis tail duplication can continue to be profitable.
1180 // For example:
1181 // A A
1182 // |\ |\
1183 // | \ | \
1184 // | C | C+BB
1185 // | / | |
1186 // |/ | |
1187 // BB => BB |
1188 // |\ |\/|
1189 // | \ |/\|
1190 // | D | D
1191 // | / | /
1192 // |/ |/
1193 // Succ Succ
1194 //
1195 // After BB was duplicated into C, the layout looks like the one on the
1196 // right. BB and C now have the same successors. When considering
1197 // whether Succ can be duplicated into all its unplaced predecessors, we
1198 // ignore C.
1199 // We can do this because C already has a profitable fallthrough, namely
1200 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1201 // duplication and for this test.
1202 //
1203 // This allows trellises to be laid out in 2 separate chains
1204 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1205 // because it allows the creation of 2 fallthrough paths with links
1206 // between them, and we correctly identify the best layout for these
1207 // CFGs. We want to extend trellises that the user created in addition
1208 // to trellises created by tail-duplication, so we just look for the
1209 // CFG.
1210 continue;
1211 Duplicate = false;
1212 continue;
1213 }
1214 NumDup++;
1215 }
1216
1217 // No possible duplication in current filter set.
1218 if (NumDup == 0)
1219 return false;
1220
1221 // If profile information is available, findDuplicateCandidates can do more
1222 // precise benefit analysis.
1223 if (F->getFunction().hasProfileData())
1224 return true;
1225
1226 // This is mainly for function exit BB.
1227 // The integrated tail duplication is really designed for increasing
1228 // fallthrough from predecessors from Succ to its successors. We may need
1229 // other machanism to handle different cases.
1230 if (Succ->succ_empty())
1231 return true;
1232
1233 // Plus the already placed predecessor.
1234 NumDup++;
1235
1236 // If the duplication candidate has more unplaced predecessors than
1237 // successors, the extra duplication can't bring more fallthrough.
1238 //
1239 // Pred1 Pred2 Pred3
1240 // \ | /
1241 // \ | /
1242 // \ | /
1243 // Dup
1244 // / \
1245 // / \
1246 // Succ1 Succ2
1247 //
1248 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1249 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1250 // but the duplication into Pred3 can't increase fallthrough.
1251 //
1252 // A small number of extra duplication may not hurt too much. We need a better
1253 // heuristic to handle it.
1254 if ((NumDup > Succ->succ_size()) || !Duplicate)
1255 return false;
1256
1257 return true;
1258}
1259
1260/// Find chains of triangles where we believe it would be profitable to
1261/// tail-duplicate them all, but a local analysis would not find them.
1262/// There are 3 ways this can be profitable:
1263/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1264/// longer chains)
1265/// 2) The chains are statically correlated. Branch probabilities have a very
1266/// U-shaped distribution.
1267/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1268/// If the branches in a chain are likely to be from the same side of the
1269/// distribution as their predecessor, but are independent at runtime, this
1270/// transformation is profitable. (Because the cost of being wrong is a small
1271/// fixed cost, unlike the standard triangle layout where the cost of being
1272/// wrong scales with the # of triangles.)
1273/// 3) The chains are dynamically correlated. If the probability that a previous
1274/// branch was taken positively influences whether the next branch will be
1275/// taken
1276/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1277void MachineBlockPlacement::precomputeTriangleChains() {
1278 struct TriangleChain {
1279 std::vector<MachineBasicBlock *> Edges;
1280
1281 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1282 : Edges({src, dst}) {}
1283
1284 void append(MachineBasicBlock *dst) {
1285 assert(getKey()->isSuccessor(dst) &&
1286 "Attempting to append a block that is not a successor.");
1287 Edges.push_back(dst);
1288 }
1289
1290 unsigned count() const { return Edges.size() - 1; }
1291
1292 MachineBasicBlock *getKey() const {
1293 return Edges.back();
1294 }
1295 };
1296
1297 if (TriangleChainCount == 0)
1298 return;
1299
1300 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1301 // Map from last block to the chain that contains it. This allows us to extend
1302 // chains as we find new triangles.
1304 for (MachineBasicBlock &BB : *F) {
1305 // If BB doesn't have 2 successors, it doesn't start a triangle.
1306 if (BB.succ_size() != 2)
1307 continue;
1308 MachineBasicBlock *PDom = nullptr;
1309 for (MachineBasicBlock *Succ : BB.successors()) {
1310 if (!MPDT->dominates(Succ, &BB))
1311 continue;
1312 PDom = Succ;
1313 break;
1314 }
1315 // If BB doesn't have a post-dominating successor, it doesn't form a
1316 // triangle.
1317 if (PDom == nullptr)
1318 continue;
1319 // If PDom has a hint that it is low probability, skip this triangle.
1320 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1321 continue;
1322 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1323 // we're looking for.
1324 if (!shouldTailDuplicate(PDom))
1325 continue;
1326 bool CanTailDuplicate = true;
1327 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1328 // isn't the kind of triangle we're looking for.
1329 for (MachineBasicBlock* Pred : PDom->predecessors()) {
1330 if (Pred == &BB)
1331 continue;
1332 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1333 CanTailDuplicate = false;
1334 break;
1335 }
1336 }
1337 // If we can't tail-duplicate PDom to its predecessors, then skip this
1338 // triangle.
1339 if (!CanTailDuplicate)
1340 continue;
1341
1342 // Now we have an interesting triangle. Insert it if it's not part of an
1343 // existing chain.
1344 // Note: This cannot be replaced with a call insert() or emplace() because
1345 // the find key is BB, but the insert/emplace key is PDom.
1346 auto Found = TriangleChainMap.find(&BB);
1347 // If it is, remove the chain from the map, grow it, and put it back in the
1348 // map with the end as the new key.
1349 if (Found != TriangleChainMap.end()) {
1350 TriangleChain Chain = std::move(Found->second);
1351 TriangleChainMap.erase(Found);
1352 Chain.append(PDom);
1353 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1354 } else {
1355 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1356 assert(InsertResult.second && "Block seen twice.");
1357 (void)InsertResult;
1358 }
1359 }
1360
1361 // Iterating over a DenseMap is safe here, because the only thing in the body
1362 // of the loop is inserting into another DenseMap (ComputedEdges).
1363 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1364 for (auto &ChainPair : TriangleChainMap) {
1365 TriangleChain &Chain = ChainPair.second;
1366 // Benchmarking has shown that due to branch correlation duplicating 2 or
1367 // more triangles is profitable, despite the calculations assuming
1368 // independence.
1369 if (Chain.count() < TriangleChainCount)
1370 continue;
1371 MachineBasicBlock *dst = Chain.Edges.back();
1372 Chain.Edges.pop_back();
1373 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1374 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1375 << getBlockName(dst)
1376 << " as pre-computed based on triangles.\n");
1377
1378 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1379 assert(InsertResult.second && "Block seen twice.");
1380 (void)InsertResult;
1381
1382 dst = src;
1383 }
1384 }
1385}
1386
1387// When profile is not present, return the StaticLikelyProb.
1388// When profile is available, we need to handle the triangle-shape CFG.
1390 const MachineBasicBlock *BB) {
1391 if (!BB->getParent()->getFunction().hasProfileData())
1393 if (BB->succ_size() == 2) {
1394 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1395 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1396 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1397 /* See case 1 below for the cost analysis. For BB->Succ to
1398 * be taken with smaller cost, the following needs to hold:
1399 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1400 * So the threshold T in the calculation below
1401 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1402 * So T / (1 - T) = 2, Yielding T = 2/3
1403 * Also adding user specified branch bias, we have
1404 * T = (2/3)*(ProfileLikelyProb/50)
1405 * = (2*ProfileLikelyProb)/150)
1406 */
1407 return BranchProbability(2 * ProfileLikelyProb, 150);
1408 }
1409 }
1411}
1412
1413/// Checks to see if the layout candidate block \p Succ has a better layout
1414/// predecessor than \c BB. If yes, returns true.
1415/// \p SuccProb: The probability adjusted for only remaining blocks.
1416/// Only used for logging
1417/// \p RealSuccProb: The un-adjusted probability.
1418/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1419/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1420/// considered
1421bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1422 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1423 const BlockChain &SuccChain, BranchProbability SuccProb,
1424 BranchProbability RealSuccProb, const BlockChain &Chain,
1425 const BlockFilterSet *BlockFilter) {
1426
1427 // There isn't a better layout when there are no unscheduled predecessors.
1428 if (SuccChain.UnscheduledPredecessors == 0)
1429 return false;
1430
1431 // There are two basic scenarios here:
1432 // -------------------------------------
1433 // Case 1: triangular shape CFG (if-then):
1434 // BB
1435 // | \
1436 // | \
1437 // | Pred
1438 // | /
1439 // Succ
1440 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1441 // set Succ as the layout successor of BB. Picking Succ as BB's
1442 // successor breaks the CFG constraints (FIXME: define these constraints).
1443 // With this layout, Pred BB
1444 // is forced to be outlined, so the overall cost will be cost of the
1445 // branch taken from BB to Pred, plus the cost of back taken branch
1446 // from Pred to Succ, as well as the additional cost associated
1447 // with the needed unconditional jump instruction from Pred To Succ.
1448
1449 // The cost of the topological order layout is the taken branch cost
1450 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1451 // must hold:
1452 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1453 // < freq(BB->Succ) * taken_branch_cost.
1454 // Ignoring unconditional jump cost, we get
1455 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1456 // prob(BB->Succ) > 2 * prob(BB->Pred)
1457 //
1458 // When real profile data is available, we can precisely compute the
1459 // probability threshold that is needed for edge BB->Succ to be considered.
1460 // Without profile data, the heuristic requires the branch bias to be
1461 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1462 // -----------------------------------------------------------------
1463 // Case 2: diamond like CFG (if-then-else):
1464 // S
1465 // / \
1466 // | \
1467 // BB Pred
1468 // \ /
1469 // Succ
1470 // ..
1471 //
1472 // The current block is BB and edge BB->Succ is now being evaluated.
1473 // Note that edge S->BB was previously already selected because
1474 // prob(S->BB) > prob(S->Pred).
1475 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1476 // choose Pred, we will have a topological ordering as shown on the left
1477 // in the picture below. If we choose Succ, we have the solution as shown
1478 // on the right:
1479 //
1480 // topo-order:
1481 //
1482 // S----- ---S
1483 // | | | |
1484 // ---BB | | BB
1485 // | | | |
1486 // | Pred-- | Succ--
1487 // | | | |
1488 // ---Succ ---Pred--
1489 //
1490 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1491 // = freq(S->Pred) + freq(S->BB)
1492 //
1493 // If we have profile data (i.e, branch probabilities can be trusted), the
1494 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1495 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1496 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1497 // means the cost of topological order is greater.
1498 // When profile data is not available, however, we need to be more
1499 // conservative. If the branch prediction is wrong, breaking the topo-order
1500 // will actually yield a layout with large cost. For this reason, we need
1501 // strong biased branch at block S with Prob(S->BB) in order to select
1502 // BB->Succ. This is equivalent to looking the CFG backward with backward
1503 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1504 // profile data).
1505 // --------------------------------------------------------------------------
1506 // Case 3: forked diamond
1507 // S
1508 // / \
1509 // / \
1510 // BB Pred
1511 // | \ / |
1512 // | \ / |
1513 // | X |
1514 // | / \ |
1515 // | / \ |
1516 // S1 S2
1517 //
1518 // The current block is BB and edge BB->S1 is now being evaluated.
1519 // As above S->BB was already selected because
1520 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1521 //
1522 // topo-order:
1523 //
1524 // S-------| ---S
1525 // | | | |
1526 // ---BB | | BB
1527 // | | | |
1528 // | Pred----| | S1----
1529 // | | | |
1530 // --(S1 or S2) ---Pred--
1531 // |
1532 // S2
1533 //
1534 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1535 // + min(freq(Pred->S1), freq(Pred->S2))
1536 // Non-topo-order cost:
1537 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1538 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1539 // is 0. Then the non topo layout is better when
1540 // freq(S->Pred) < freq(BB->S1).
1541 // This is exactly what is checked below.
1542 // Note there are other shapes that apply (Pred may not be a single block,
1543 // but they all fit this general pattern.)
1545
1546 // Make sure that a hot successor doesn't have a globally more
1547 // important predecessor.
1548 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1549 bool BadCFGConflict = false;
1550
1551 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1552 BlockChain *PredChain = BlockToChain[Pred];
1553 if (Pred == Succ || PredChain == &SuccChain ||
1554 (BlockFilter && !BlockFilter->count(Pred)) ||
1555 PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
1556 // This check is redundant except for look ahead. This function is
1557 // called for lookahead by isProfitableToTailDup when BB hasn't been
1558 // placed yet.
1559 (Pred == BB))
1560 continue;
1561 // Do backward checking.
1562 // For all cases above, we need a backward checking to filter out edges that
1563 // are not 'strongly' biased.
1564 // BB Pred
1565 // \ /
1566 // Succ
1567 // We select edge BB->Succ if
1568 // freq(BB->Succ) > freq(Succ) * HotProb
1569 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1570 // HotProb
1571 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1572 // Case 1 is covered too, because the first equation reduces to:
1573 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1574 BlockFrequency PredEdgeFreq =
1575 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1576 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1577 BadCFGConflict = true;
1578 break;
1579 }
1580 }
1581
1582 if (BadCFGConflict) {
1583 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1584 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1585 return true;
1586 }
1587
1588 return false;
1589}
1590
1591/// Select the best successor for a block.
1592///
1593/// This looks across all successors of a particular block and attempts to
1594/// select the "best" one to be the layout successor. It only considers direct
1595/// successors which also pass the block filter. It will attempt to avoid
1596/// breaking CFG structure, but cave and break such structures in the case of
1597/// very hot successor edges.
1598///
1599/// \returns The best successor block found, or null if none are viable, along
1600/// with a boolean indicating if tail duplication is necessary.
1601MachineBlockPlacement::BlockAndTailDupResult
1602MachineBlockPlacement::selectBestSuccessor(
1603 const MachineBasicBlock *BB, const BlockChain &Chain,
1604 const BlockFilterSet *BlockFilter) {
1605 const BranchProbability HotProb(StaticLikelyProb, 100);
1606
1607 BlockAndTailDupResult BestSucc = { nullptr, false };
1608 auto BestProb = BranchProbability::getZero();
1609
1611 auto AdjustedSumProb =
1612 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1613
1614 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1615 << "\n");
1616
1617 // if we already precomputed the best successor for BB, return that if still
1618 // applicable.
1619 auto FoundEdge = ComputedEdges.find(BB);
1620 if (FoundEdge != ComputedEdges.end()) {
1621 MachineBasicBlock *Succ = FoundEdge->second.BB;
1622 ComputedEdges.erase(FoundEdge);
1623 BlockChain *SuccChain = BlockToChain[Succ];
1624 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1625 SuccChain != &Chain && Succ == *SuccChain->begin())
1626 return FoundEdge->second;
1627 }
1628
1629 // if BB is part of a trellis, Use the trellis to determine the optimal
1630 // fallthrough edges
1631 if (isTrellis(BB, Successors, Chain, BlockFilter))
1632 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1633 BlockFilter);
1634
1635 // For blocks with CFG violations, we may be able to lay them out anyway with
1636 // tail-duplication. We keep this vector so we can perform the probability
1637 // calculations the minimum number of times.
1639 DupCandidates;
1640 for (MachineBasicBlock *Succ : Successors) {
1641 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1642 BranchProbability SuccProb =
1643 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1644
1645 BlockChain &SuccChain = *BlockToChain[Succ];
1646 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1647 // predecessor that yields lower global cost.
1648 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1649 Chain, BlockFilter)) {
1650 // If tail duplication would make Succ profitable, place it.
1651 if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
1652 DupCandidates.emplace_back(SuccProb, Succ);
1653 continue;
1654 }
1655
1656 LLVM_DEBUG(
1657 dbgs() << " Candidate: " << getBlockName(Succ)
1658 << ", probability: " << SuccProb
1659 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1660 << "\n");
1661
1662 if (BestSucc.BB && BestProb >= SuccProb) {
1663 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1664 continue;
1665 }
1666
1667 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1668 BestSucc.BB = Succ;
1669 BestProb = SuccProb;
1670 }
1671 // Handle the tail duplication candidates in order of decreasing probability.
1672 // Stop at the first one that is profitable. Also stop if they are less
1673 // profitable than BestSucc. Position is important because we preserve it and
1674 // prefer first best match. Here we aren't comparing in order, so we capture
1675 // the position instead.
1676 llvm::stable_sort(DupCandidates,
1677 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1678 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1679 return std::get<0>(L) > std::get<0>(R);
1680 });
1681 for (auto &Tup : DupCandidates) {
1682 BranchProbability DupProb;
1683 MachineBasicBlock *Succ;
1684 std::tie(DupProb, Succ) = Tup;
1685 if (DupProb < BestProb)
1686 break;
1687 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1688 && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1689 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1690 << ", probability: " << DupProb
1691 << " (Tail Duplicate)\n");
1692 BestSucc.BB = Succ;
1693 BestSucc.ShouldTailDup = true;
1694 break;
1695 }
1696 }
1697
1698 if (BestSucc.BB)
1699 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1700
1701 return BestSucc;
1702}
1703
1704/// Select the best block from a worklist.
1705///
1706/// This looks through the provided worklist as a list of candidate basic
1707/// blocks and select the most profitable one to place. The definition of
1708/// profitable only really makes sense in the context of a loop. This returns
1709/// the most frequently visited block in the worklist, which in the case of
1710/// a loop, is the one most desirable to be physically close to the rest of the
1711/// loop body in order to improve i-cache behavior.
1712///
1713/// \returns The best block found, or null if none are viable.
1714MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1715 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1716 // Once we need to walk the worklist looking for a candidate, cleanup the
1717 // worklist of already placed entries.
1718 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1719 // some code complexity) into the loop below.
1720 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1721 return BlockToChain.lookup(BB) == &Chain;
1722 });
1723
1724 if (WorkList.empty())
1725 return nullptr;
1726
1727 bool IsEHPad = WorkList[0]->isEHPad();
1728
1729 MachineBasicBlock *BestBlock = nullptr;
1730 BlockFrequency BestFreq;
1731 for (MachineBasicBlock *MBB : WorkList) {
1732 assert(MBB->isEHPad() == IsEHPad &&
1733 "EHPad mismatch between block and work list.");
1734
1735 BlockChain &SuccChain = *BlockToChain[MBB];
1736 if (&SuccChain == &Chain)
1737 continue;
1738
1739 assert(SuccChain.UnscheduledPredecessors == 0 &&
1740 "Found CFG-violating block");
1741
1742 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1743 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1744 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1745 << " (freq)\n");
1746
1747 // For ehpad, we layout the least probable first as to avoid jumping back
1748 // from least probable landingpads to more probable ones.
1749 //
1750 // FIXME: Using probability is probably (!) not the best way to achieve
1751 // this. We should probably have a more principled approach to layout
1752 // cleanup code.
1753 //
1754 // The goal is to get:
1755 //
1756 // +--------------------------+
1757 // | V
1758 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1759 //
1760 // Rather than:
1761 //
1762 // +-------------------------------------+
1763 // V |
1764 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1765 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1766 continue;
1767
1768 BestBlock = MBB;
1769 BestFreq = CandidateFreq;
1770 }
1771
1772 return BestBlock;
1773}
1774
1775/// Retrieve the first unplaced basic block in the entire function.
1776///
1777/// This routine is called when we are unable to use the CFG to walk through
1778/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1779/// We walk through the function's blocks in order, starting from the
1780/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1781/// re-scanning the entire sequence on repeated calls to this routine.
1782MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1783 const BlockChain &PlacedChain,
1784 MachineFunction::iterator &PrevUnplacedBlockIt) {
1785
1786 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1787 ++I) {
1788 if (BlockToChain[&*I] != &PlacedChain) {
1789 PrevUnplacedBlockIt = I;
1790 // Now select the head of the chain to which the unplaced block belongs
1791 // as the block to place. This will force the entire chain to be placed,
1792 // and satisfies the requirements of merging chains.
1793 return *BlockToChain[&*I]->begin();
1794 }
1795 }
1796 return nullptr;
1797}
1798
1799/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1800///
1801/// This is similar to getFirstUnplacedBlock for the entire function, but since
1802/// the size of BlockFilter is typically far less than the number of blocks in
1803/// the entire function, iterating through the BlockFilter is more efficient.
1804/// When processing the entire funciton, using the version without BlockFilter
1805/// has a complexity of #(loops in function) * #(blocks in function), while this
1806/// version has a complexity of sum(#(loops in block) foreach block in function)
1807/// which is always smaller. For long function mostly sequential in structure,
1808/// the complexity is amortized to 1 * #(blocks in function).
1809MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1810 const BlockChain &PlacedChain,
1811 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1812 const BlockFilterSet *BlockFilter) {
1813 assert(BlockFilter);
1814 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1815 ++PrevUnplacedBlockInFilterIt) {
1816 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1817 if (C != &PlacedChain) {
1818 return *C->begin();
1819 }
1820 }
1821 return nullptr;
1822}
1823
1824void MachineBlockPlacement::fillWorkLists(
1825 const MachineBasicBlock *MBB,
1826 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1827 const BlockFilterSet *BlockFilter = nullptr) {
1828 BlockChain &Chain = *BlockToChain[MBB];
1829 if (!UpdatedPreds.insert(&Chain).second)
1830 return;
1831
1832 assert(
1833 Chain.UnscheduledPredecessors == 0 &&
1834 "Attempting to place block with unscheduled predecessors in worklist.");
1835 for (MachineBasicBlock *ChainBB : Chain) {
1836 assert(BlockToChain[ChainBB] == &Chain &&
1837 "Block in chain doesn't match BlockToChain map.");
1838 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1839 if (BlockFilter && !BlockFilter->count(Pred))
1840 continue;
1841 if (BlockToChain[Pred] == &Chain)
1842 continue;
1843 ++Chain.UnscheduledPredecessors;
1844 }
1845 }
1846
1847 if (Chain.UnscheduledPredecessors != 0)
1848 return;
1849
1850 MachineBasicBlock *BB = *Chain.begin();
1851 if (BB->isEHPad())
1852 EHPadWorkList.push_back(BB);
1853 else
1854 BlockWorkList.push_back(BB);
1855}
1856
1857void MachineBlockPlacement::buildChain(
1858 const MachineBasicBlock *HeadBB, BlockChain &Chain,
1859 BlockFilterSet *BlockFilter) {
1860 assert(HeadBB && "BB must not be null.\n");
1861 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1862 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1863 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1864 if (BlockFilter)
1865 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1866
1867 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1868 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1869 MachineBasicBlock *BB = *std::prev(Chain.end());
1870 while (true) {
1871 assert(BB && "null block found at end of chain in loop.");
1872 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1873 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1874
1875
1876 // Look for the best viable successor if there is one to place immediately
1877 // after this block.
1878 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1879 MachineBasicBlock* BestSucc = Result.BB;
1880 bool ShouldTailDup = Result.ShouldTailDup;
1881 if (allowTailDupPlacement())
1882 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
1883 Chain,
1884 BlockFilter));
1885
1886 // If an immediate successor isn't available, look for the best viable
1887 // block among those we've identified as not violating the loop's CFG at
1888 // this point. This won't be a fallthrough, but it will increase locality.
1889 if (!BestSucc)
1890 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1891 if (!BestSucc)
1892 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1893
1894 if (!BestSucc) {
1895 if (BlockFilter)
1896 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1897 BlockFilter);
1898 else
1899 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1900 if (!BestSucc)
1901 break;
1902
1903 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1904 "layout successor until the CFG reduces\n");
1905 }
1906
1907 // Placement may have changed tail duplication opportunities.
1908 // Check for that now.
1909 if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
1910 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1911 BlockFilter, PrevUnplacedBlockIt,
1912 PrevUnplacedBlockInFilterIt);
1913 // If the chosen successor was duplicated into BB, don't bother laying
1914 // it out, just go round the loop again with BB as the chain end.
1915 if (!BB->isSuccessor(BestSucc))
1916 continue;
1917 }
1918
1919 // Place this block, updating the datastructures to reflect its placement.
1920 BlockChain &SuccChain = *BlockToChain[BestSucc];
1921 // Zero out UnscheduledPredecessors for the successor we're about to merge in case
1922 // we selected a successor that didn't fit naturally into the CFG.
1923 SuccChain.UnscheduledPredecessors = 0;
1924 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1925 << getBlockName(BestSucc) << "\n");
1926 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1927 Chain.merge(BestSucc, &SuccChain);
1928 BB = *std::prev(Chain.end());
1929 }
1930
1931 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1932 << getBlockName(*Chain.begin()) << "\n");
1933}
1934
1935// If bottom of block BB has only one successor OldTop, in most cases it is
1936// profitable to move it before OldTop, except the following case:
1937//
1938// -->OldTop<-
1939// | . |
1940// | . |
1941// | . |
1942// ---Pred |
1943// | |
1944// BB-----
1945//
1946// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
1947// layout the other successor below it, so it can't reduce taken branch.
1948// In this case we keep its original layout.
1949bool
1950MachineBlockPlacement::canMoveBottomBlockToTop(
1951 const MachineBasicBlock *BottomBlock,
1952 const MachineBasicBlock *OldTop) {
1953 if (BottomBlock->pred_size() != 1)
1954 return true;
1955 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
1956 if (Pred->succ_size() != 2)
1957 return true;
1958
1959 MachineBasicBlock *OtherBB = *Pred->succ_begin();
1960 if (OtherBB == BottomBlock)
1961 OtherBB = *Pred->succ_rbegin();
1962 if (OtherBB == OldTop)
1963 return false;
1964
1965 return true;
1966}
1967
1968// Find out the possible fall through frequence to the top of a loop.
1970MachineBlockPlacement::TopFallThroughFreq(
1971 const MachineBasicBlock *Top,
1972 const BlockFilterSet &LoopBlockSet) {
1973 BlockFrequency MaxFreq = BlockFrequency(0);
1974 for (MachineBasicBlock *Pred : Top->predecessors()) {
1975 BlockChain *PredChain = BlockToChain[Pred];
1976 if (!LoopBlockSet.count(Pred) &&
1977 (!PredChain || Pred == *std::prev(PredChain->end()))) {
1978 // Found a Pred block can be placed before Top.
1979 // Check if Top is the best successor of Pred.
1980 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
1981 bool TopOK = true;
1982 for (MachineBasicBlock *Succ : Pred->successors()) {
1983 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
1984 BlockChain *SuccChain = BlockToChain[Succ];
1985 // Check if Succ can be placed after Pred.
1986 // Succ should not be in any chain, or it is the head of some chain.
1987 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
1988 (!SuccChain || Succ == *SuccChain->begin())) {
1989 TopOK = false;
1990 break;
1991 }
1992 }
1993 if (TopOK) {
1994 BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
1995 MBPI->getEdgeProbability(Pred, Top);
1996 if (EdgeFreq > MaxFreq)
1997 MaxFreq = EdgeFreq;
1998 }
1999 }
2000 }
2001 return MaxFreq;
2002}
2003
2004// Compute the fall through gains when move NewTop before OldTop.
2005//
2006// In following diagram, edges marked as "-" are reduced fallthrough, edges
2007// marked as "+" are increased fallthrough, this function computes
2008//
2009// SUM(increased fallthrough) - SUM(decreased fallthrough)
2010//
2011// |
2012// | -
2013// V
2014// --->OldTop
2015// | .
2016// | .
2017// +| . +
2018// | Pred --->
2019// | |-
2020// | V
2021// --- NewTop <---
2022// |-
2023// V
2024//
2026MachineBlockPlacement::FallThroughGains(
2027 const MachineBasicBlock *NewTop,
2028 const MachineBasicBlock *OldTop,
2029 const MachineBasicBlock *ExitBB,
2030 const BlockFilterSet &LoopBlockSet) {
2031 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2032 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2033 if (ExitBB)
2034 FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
2035 MBPI->getEdgeProbability(NewTop, ExitBB);
2036 BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
2037 MBPI->getEdgeProbability(NewTop, OldTop);
2038
2039 // Find the best Pred of NewTop.
2040 MachineBasicBlock *BestPred = nullptr;
2041 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2042 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2043 if (!LoopBlockSet.count(Pred))
2044 continue;
2045 BlockChain *PredChain = BlockToChain[Pred];
2046 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2047 BlockFrequency EdgeFreq =
2048 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2049 if (EdgeFreq > FallThroughFromPred) {
2050 FallThroughFromPred = EdgeFreq;
2051 BestPred = Pred;
2052 }
2053 }
2054 }
2055
2056 // If NewTop is not placed after Pred, another successor can be placed
2057 // after Pred.
2058 BlockFrequency NewFreq = BlockFrequency(0);
2059 if (BestPred) {
2060 for (MachineBasicBlock *Succ : BestPred->successors()) {
2061 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2062 continue;
2063 if (ComputedEdges.contains(Succ))
2064 continue;
2065 BlockChain *SuccChain = BlockToChain[Succ];
2066 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2067 (SuccChain == BlockToChain[BestPred]))
2068 continue;
2069 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2070 MBPI->getEdgeProbability(BestPred, Succ);
2071 if (EdgeFreq > NewFreq)
2072 NewFreq = EdgeFreq;
2073 }
2074 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2075 MBPI->getEdgeProbability(BestPred, NewTop);
2076 if (NewFreq > OrigEdgeFreq) {
2077 // If NewTop is not the best successor of Pred, then Pred doesn't
2078 // fallthrough to NewTop. So there is no FallThroughFromPred and
2079 // NewFreq.
2080 NewFreq = BlockFrequency(0);
2081 FallThroughFromPred = BlockFrequency(0);
2082 }
2083 }
2084
2086 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2087 BlockFrequency Lost =
2088 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2089 if (Gains > Lost)
2090 Result = Gains - Lost;
2091 return Result;
2092}
2093
2094/// Helper function of findBestLoopTop. Find the best loop top block
2095/// from predecessors of old top.
2096///
2097/// Look for a block which is strictly better than the old top for laying
2098/// out before the old top of the loop. This looks for only two patterns:
2099///
2100/// 1. a block has only one successor, the old loop top
2101///
2102/// Because such a block will always result in an unconditional jump,
2103/// rotating it in front of the old top is always profitable.
2104///
2105/// 2. a block has two successors, one is old top, another is exit
2106/// and it has more than one predecessors
2107///
2108/// If it is below one of its predecessors P, only P can fall through to
2109/// it, all other predecessors need a jump to it, and another conditional
2110/// jump to loop header. If it is moved before loop header, all its
2111/// predecessors jump to it, then fall through to loop header. So all its
2112/// predecessors except P can reduce one taken branch.
2113/// At the same time, move it before old top increases the taken branch
2114/// to loop exit block, so the reduced taken branch will be compared with
2115/// the increased taken branch to the loop exit block.
2117MachineBlockPlacement::findBestLoopTopHelper(
2118 MachineBasicBlock *OldTop,
2119 const MachineLoop &L,
2120 const BlockFilterSet &LoopBlockSet) {
2121 // Check that the header hasn't been fused with a preheader block due to
2122 // crazy branches. If it has, we need to start with the header at the top to
2123 // prevent pulling the preheader into the loop body.
2124 BlockChain &HeaderChain = *BlockToChain[OldTop];
2125 if (!LoopBlockSet.count(*HeaderChain.begin()))
2126 return OldTop;
2127 if (OldTop != *HeaderChain.begin())
2128 return OldTop;
2129
2130 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2131 << "\n");
2132
2133 BlockFrequency BestGains = BlockFrequency(0);
2134 MachineBasicBlock *BestPred = nullptr;
2135 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2136 if (!LoopBlockSet.count(Pred))
2137 continue;
2138 if (Pred == L.getHeader())
2139 continue;
2140 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2141 << Pred->succ_size() << " successors, "
2142 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2143 if (Pred->succ_size() > 2)
2144 continue;
2145
2146 MachineBasicBlock *OtherBB = nullptr;
2147 if (Pred->succ_size() == 2) {
2148 OtherBB = *Pred->succ_begin();
2149 if (OtherBB == OldTop)
2150 OtherBB = *Pred->succ_rbegin();
2151 }
2152
2153 if (!canMoveBottomBlockToTop(Pred, OldTop))
2154 continue;
2155
2156 BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
2157 LoopBlockSet);
2158 if ((Gains > BlockFrequency(0)) &&
2159 (Gains > BestGains ||
2160 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2161 BestPred = Pred;
2162 BestGains = Gains;
2163 }
2164 }
2165
2166 // If no direct predecessor is fine, just use the loop header.
2167 if (!BestPred) {
2168 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2169 return OldTop;
2170 }
2171
2172 // Walk backwards through any straight line of predecessors.
2173 while (BestPred->pred_size() == 1 &&
2174 (*BestPred->pred_begin())->succ_size() == 1 &&
2175 *BestPred->pred_begin() != L.getHeader())
2176 BestPred = *BestPred->pred_begin();
2177
2178 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2179 return BestPred;
2180}
2181
2182/// Find the best loop top block for layout.
2183///
2184/// This function iteratively calls findBestLoopTopHelper, until no new better
2185/// BB can be found.
2187MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2188 const BlockFilterSet &LoopBlockSet) {
2189 // Placing the latch block before the header may introduce an extra branch
2190 // that skips this block the first time the loop is executed, which we want
2191 // to avoid when optimising for size.
2192 // FIXME: in theory there is a case that does not introduce a new branch,
2193 // i.e. when the layout predecessor does not fallthrough to the loop header.
2194 // In practice this never happens though: there always seems to be a preheader
2195 // that can fallthrough and that is also placed before the header.
2196 bool OptForSize = F->getFunction().hasOptSize() ||
2197 llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
2198 if (OptForSize)
2199 return L.getHeader();
2200
2201 MachineBasicBlock *OldTop = nullptr;
2202 MachineBasicBlock *NewTop = L.getHeader();
2203 while (NewTop != OldTop) {
2204 OldTop = NewTop;
2205 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2206 if (NewTop != OldTop)
2207 ComputedEdges[NewTop] = { OldTop, false };
2208 }
2209 return NewTop;
2210}
2211
2212/// Find the best loop exiting block for layout.
2213///
2214/// This routine implements the logic to analyze the loop looking for the best
2215/// block to layout at the top of the loop. Typically this is done to maximize
2216/// fallthrough opportunities.
2218MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2219 const BlockFilterSet &LoopBlockSet,
2220 BlockFrequency &ExitFreq) {
2221 // We don't want to layout the loop linearly in all cases. If the loop header
2222 // is just a normal basic block in the loop, we want to look for what block
2223 // within the loop is the best one to layout at the top. However, if the loop
2224 // header has be pre-merged into a chain due to predecessors not having
2225 // analyzable branches, *and* the predecessor it is merged with is *not* part
2226 // of the loop, rotating the header into the middle of the loop will create
2227 // a non-contiguous range of blocks which is Very Bad. So start with the
2228 // header and only rotate if safe.
2229 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2230 if (!LoopBlockSet.count(*HeaderChain.begin()))
2231 return nullptr;
2232
2233 BlockFrequency BestExitEdgeFreq;
2234 unsigned BestExitLoopDepth = 0;
2235 MachineBasicBlock *ExitingBB = nullptr;
2236 // If there are exits to outer loops, loop rotation can severely limit
2237 // fallthrough opportunities unless it selects such an exit. Keep a set of
2238 // blocks where rotating to exit with that block will reach an outer loop.
2239 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2240
2241 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2242 << getBlockName(L.getHeader()) << "\n");
2243 for (MachineBasicBlock *MBB : L.getBlocks()) {
2244 BlockChain &Chain = *BlockToChain[MBB];
2245 // Ensure that this block is at the end of a chain; otherwise it could be
2246 // mid-way through an inner loop or a successor of an unanalyzable branch.
2247 if (MBB != *std::prev(Chain.end()))
2248 continue;
2249
2250 // Now walk the successors. We need to establish whether this has a viable
2251 // exiting successor and whether it has a viable non-exiting successor.
2252 // We store the old exiting state and restore it if a viable looping
2253 // successor isn't found.
2254 MachineBasicBlock *OldExitingBB = ExitingBB;
2255 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2256 bool HasLoopingSucc = false;
2257 for (MachineBasicBlock *Succ : MBB->successors()) {
2258 if (Succ->isEHPad())
2259 continue;
2260 if (Succ == MBB)
2261 continue;
2262 BlockChain &SuccChain = *BlockToChain[Succ];
2263 // Don't split chains, either this chain or the successor's chain.
2264 if (&Chain == &SuccChain) {
2265 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2266 << getBlockName(Succ) << " (chain conflict)\n");
2267 continue;
2268 }
2269
2270 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2271 if (LoopBlockSet.count(Succ)) {
2272 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2273 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2274 HasLoopingSucc = true;
2275 continue;
2276 }
2277
2278 unsigned SuccLoopDepth = 0;
2279 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2280 SuccLoopDepth = ExitLoop->getLoopDepth();
2281 if (ExitLoop->contains(&L))
2282 BlocksExitingToOuterLoop.insert(MBB);
2283 }
2284
2285 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2286 LLVM_DEBUG(
2287 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2288 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2289 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2290 // Note that we bias this toward an existing layout successor to retain
2291 // incoming order in the absence of better information. The exit must have
2292 // a frequency higher than the current exit before we consider breaking
2293 // the layout.
2294 BranchProbability Bias(100 - ExitBlockBias, 100);
2295 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2296 ExitEdgeFreq > BestExitEdgeFreq ||
2297 (MBB->isLayoutSuccessor(Succ) &&
2298 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2299 BestExitEdgeFreq = ExitEdgeFreq;
2300 ExitingBB = MBB;
2301 }
2302 }
2303
2304 if (!HasLoopingSucc) {
2305 // Restore the old exiting state, no viable looping successor was found.
2306 ExitingBB = OldExitingBB;
2307 BestExitEdgeFreq = OldBestExitEdgeFreq;
2308 }
2309 }
2310 // Without a candidate exiting block or with only a single block in the
2311 // loop, just use the loop header to layout the loop.
2312 if (!ExitingBB) {
2313 LLVM_DEBUG(
2314 dbgs() << " No other candidate exit blocks, using loop header\n");
2315 return nullptr;
2316 }
2317 if (L.getNumBlocks() == 1) {
2318 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2319 return nullptr;
2320 }
2321
2322 // Also, if we have exit blocks which lead to outer loops but didn't select
2323 // one of them as the exiting block we are rotating toward, disable loop
2324 // rotation altogether.
2325 if (!BlocksExitingToOuterLoop.empty() &&
2326 !BlocksExitingToOuterLoop.count(ExitingBB))
2327 return nullptr;
2328
2329 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2330 << "\n");
2331 ExitFreq = BestExitEdgeFreq;
2332 return ExitingBB;
2333}
2334
2335/// Check if there is a fallthrough to loop header Top.
2336///
2337/// 1. Look for a Pred that can be layout before Top.
2338/// 2. Check if Top is the most possible successor of Pred.
2339bool
2340MachineBlockPlacement::hasViableTopFallthrough(
2341 const MachineBasicBlock *Top,
2342 const BlockFilterSet &LoopBlockSet) {
2343 for (MachineBasicBlock *Pred : Top->predecessors()) {
2344 BlockChain *PredChain = BlockToChain[Pred];
2345 if (!LoopBlockSet.count(Pred) &&
2346 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2347 // Found a Pred block can be placed before Top.
2348 // Check if Top is the best successor of Pred.
2349 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2350 bool TopOK = true;
2351 for (MachineBasicBlock *Succ : Pred->successors()) {
2352 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2353 BlockChain *SuccChain = BlockToChain[Succ];
2354 // Check if Succ can be placed after Pred.
2355 // Succ should not be in any chain, or it is the head of some chain.
2356 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2357 TopOK = false;
2358 break;
2359 }
2360 }
2361 if (TopOK)
2362 return true;
2363 }
2364 }
2365 return false;
2366}
2367
2368/// Attempt to rotate an exiting block to the bottom of the loop.
2369///
2370/// Once we have built a chain, try to rotate it to line up the hot exit block
2371/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2372/// branches. For example, if the loop has fallthrough into its header and out
2373/// of its bottom already, don't rotate it.
2374void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2375 const MachineBasicBlock *ExitingBB,
2376 BlockFrequency ExitFreq,
2377 const BlockFilterSet &LoopBlockSet) {
2378 if (!ExitingBB)
2379 return;
2380
2381 MachineBasicBlock *Top = *LoopChain.begin();
2382 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2383
2384 // If ExitingBB is already the last one in a chain then nothing to do.
2385 if (Bottom == ExitingBB)
2386 return;
2387
2388 // The entry block should always be the first BB in a function.
2389 if (Top->isEntryBlock())
2390 return;
2391
2392 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2393
2394 // If the header has viable fallthrough, check whether the current loop
2395 // bottom is a viable exiting block. If so, bail out as rotating will
2396 // introduce an unnecessary branch.
2397 if (ViableTopFallthrough) {
2398 for (MachineBasicBlock *Succ : Bottom->successors()) {
2399 BlockChain *SuccChain = BlockToChain[Succ];
2400 if (!LoopBlockSet.count(Succ) &&
2401 (!SuccChain || Succ == *SuccChain->begin()))
2402 return;
2403 }
2404
2405 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2406 // frequency is larger than top fallthrough.
2407 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2408 if (FallThrough2Top >= ExitFreq)
2409 return;
2410 }
2411
2412 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2413 if (ExitIt == LoopChain.end())
2414 return;
2415
2416 // Rotating a loop exit to the bottom when there is a fallthrough to top
2417 // trades the entry fallthrough for an exit fallthrough.
2418 // If there is no bottom->top edge, but the chosen exit block does have
2419 // a fallthrough, we break that fallthrough for nothing in return.
2420
2421 // Let's consider an example. We have a built chain of basic blocks
2422 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2423 // By doing a rotation we get
2424 // Bk+1, ..., Bn, B1, ..., Bk
2425 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2426 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2427 // It might be compensated by fallthrough Bn -> B1.
2428 // So we have a condition to avoid creation of extra branch by loop rotation.
2429 // All below must be true to avoid loop rotation:
2430 // If there is a fallthrough to top (B1)
2431 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2432 // There is no fallthrough from bottom (Bn) to top (B1).
2433 // Please note that there is no exit fallthrough from Bn because we checked it
2434 // above.
2435 if (ViableTopFallthrough) {
2436 assert(std::next(ExitIt) != LoopChain.end() &&
2437 "Exit should not be last BB");
2438 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2439 if (ExitingBB->isSuccessor(NextBlockInChain))
2440 if (!Bottom->isSuccessor(Top))
2441 return;
2442 }
2443
2444 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2445 << " at bottom\n");
2446 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2447}
2448
2449/// Attempt to rotate a loop based on profile data to reduce branch cost.
2450///
2451/// With profile data, we can determine the cost in terms of missed fall through
2452/// opportunities when rotating a loop chain and select the best rotation.
2453/// Basically, there are three kinds of cost to consider for each rotation:
2454/// 1. The possibly missed fall through edge (if it exists) from BB out of
2455/// the loop to the loop header.
2456/// 2. The possibly missed fall through edges (if they exist) from the loop
2457/// exits to BB out of the loop.
2458/// 3. The missed fall through edge (if it exists) from the last BB to the
2459/// first BB in the loop chain.
2460/// Therefore, the cost for a given rotation is the sum of costs listed above.
2461/// We select the best rotation with the smallest cost.
2462void MachineBlockPlacement::rotateLoopWithProfile(
2463 BlockChain &LoopChain, const MachineLoop &L,
2464 const BlockFilterSet &LoopBlockSet) {
2465 auto RotationPos = LoopChain.end();
2466 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2467
2468 // The entry block should always be the first BB in a function.
2469 if (ChainHeaderBB->isEntryBlock())
2470 return;
2471
2472 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2473
2474 // A utility lambda that scales up a block frequency by dividing it by a
2475 // branch probability which is the reciprocal of the scale.
2476 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2477 unsigned Scale) -> BlockFrequency {
2478 if (Scale == 0)
2479 return BlockFrequency(0);
2480 // Use operator / between BlockFrequency and BranchProbability to implement
2481 // saturating multiplication.
2482 return Freq / BranchProbability(1, Scale);
2483 };
2484
2485 // Compute the cost of the missed fall-through edge to the loop header if the
2486 // chain head is not the loop header. As we only consider natural loops with
2487 // single header, this computation can be done only once.
2488 BlockFrequency HeaderFallThroughCost(0);
2489 for (auto *Pred : ChainHeaderBB->predecessors()) {
2490 BlockChain *PredChain = BlockToChain[Pred];
2491 if (!LoopBlockSet.count(Pred) &&
2492 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2493 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2494 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2495 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2496 // If the predecessor has only an unconditional jump to the header, we
2497 // need to consider the cost of this jump.
2498 if (Pred->succ_size() == 1)
2499 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2500 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2501 }
2502 }
2503
2504 // Here we collect all exit blocks in the loop, and for each exit we find out
2505 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2506 // as the sum of frequencies of exit edges we collect here, excluding the exit
2507 // edge from the tail of the loop chain.
2509 for (auto *BB : LoopChain) {
2510 auto LargestExitEdgeProb = BranchProbability::getZero();
2511 for (auto *Succ : BB->successors()) {
2512 BlockChain *SuccChain = BlockToChain[Succ];
2513 if (!LoopBlockSet.count(Succ) &&
2514 (!SuccChain || Succ == *SuccChain->begin())) {
2515 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2516 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2517 }
2518 }
2519 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2520 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2521 ExitsWithFreq.emplace_back(BB, ExitFreq);
2522 }
2523 }
2524
2525 // In this loop we iterate every block in the loop chain and calculate the
2526 // cost assuming the block is the head of the loop chain. When the loop ends,
2527 // we should have found the best candidate as the loop chain's head.
2528 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2529 EndIter = LoopChain.end();
2530 Iter != EndIter; Iter++, TailIter++) {
2531 // TailIter is used to track the tail of the loop chain if the block we are
2532 // checking (pointed by Iter) is the head of the chain.
2533 if (TailIter == LoopChain.end())
2534 TailIter = LoopChain.begin();
2535
2536 auto TailBB = *TailIter;
2537
2538 // Calculate the cost by putting this BB to the top.
2540
2541 // If the current BB is the loop header, we need to take into account the
2542 // cost of the missed fall through edge from outside of the loop to the
2543 // header.
2544 if (Iter != LoopChain.begin())
2545 Cost += HeaderFallThroughCost;
2546
2547 // Collect the loop exit cost by summing up frequencies of all exit edges
2548 // except the one from the chain tail.
2549 for (auto &ExitWithFreq : ExitsWithFreq)
2550 if (TailBB != ExitWithFreq.first)
2551 Cost += ExitWithFreq.second;
2552
2553 // The cost of breaking the once fall-through edge from the tail to the top
2554 // of the loop chain. Here we need to consider three cases:
2555 // 1. If the tail node has only one successor, then we will get an
2556 // additional jmp instruction. So the cost here is (MisfetchCost +
2557 // JumpInstCost) * tail node frequency.
2558 // 2. If the tail node has two successors, then we may still get an
2559 // additional jmp instruction if the layout successor after the loop
2560 // chain is not its CFG successor. Note that the more frequently executed
2561 // jmp instruction will be put ahead of the other one. Assume the
2562 // frequency of those two branches are x and y, where x is the frequency
2563 // of the edge to the chain head, then the cost will be
2564 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2565 // 3. If the tail node has more than two successors (this rarely happens),
2566 // we won't consider any additional cost.
2567 if (TailBB->isSuccessor(*Iter)) {
2568 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2569 if (TailBB->succ_size() == 1)
2570 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2571 else if (TailBB->succ_size() == 2) {
2572 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2573 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2574 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2575 ? TailBBFreq * TailToHeadProb.getCompl()
2576 : TailToHeadFreq;
2577 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2578 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2579 }
2580 }
2581
2582 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2583 << getBlockName(*Iter) << " to the top: "
2584 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2585
2586 if (Cost < SmallestRotationCost) {
2587 SmallestRotationCost = Cost;
2588 RotationPos = Iter;
2589 }
2590 }
2591
2592 if (RotationPos != LoopChain.end()) {
2593 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2594 << " to the top\n");
2595 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2596 }
2597}
2598
2599/// Collect blocks in the given loop that are to be placed.
2600///
2601/// When profile data is available, exclude cold blocks from the returned set;
2602/// otherwise, collect all blocks in the loop.
2604MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2605 // Collect the blocks in a set ordered by block number, as this gives the same
2606 // order as they appear in the function.
2607 struct MBBCompare {
2608 bool operator()(const MachineBasicBlock *X,
2609 const MachineBasicBlock *Y) const {
2610 return X->getNumber() < Y->getNumber();
2611 }
2612 };
2613 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2614
2615 // Filter cold blocks off from LoopBlockSet when profile data is available.
2616 // Collect the sum of frequencies of incoming edges to the loop header from
2617 // outside. If we treat the loop as a super block, this is the frequency of
2618 // the loop. Then for each block in the loop, we calculate the ratio between
2619 // its frequency and the frequency of the loop block. When it is too small,
2620 // don't add it to the loop chain. If there are outer loops, then this block
2621 // will be merged into the first outer loop chain for which this block is not
2622 // cold anymore. This needs precise profile data and we only do this when
2623 // profile data is available.
2624 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2625 BlockFrequency LoopFreq(0);
2626 for (auto *LoopPred : L.getHeader()->predecessors())
2627 if (!L.contains(LoopPred))
2628 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2629 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2630
2631 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2632 if (LoopBlockSet.count(LoopBB))
2633 continue;
2634 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2635 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2636 continue;
2637 BlockChain *Chain = BlockToChain[LoopBB];
2638 for (MachineBasicBlock *ChainBB : *Chain)
2639 LoopBlockSet.insert(ChainBB);
2640 }
2641 } else
2642 LoopBlockSet.insert(L.block_begin(), L.block_end());
2643
2644 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2645 // std::set. We will only remove blocks and never insert them, which will
2646 // preserve the ordering.
2647 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2648 return Ret;
2649}
2650
2651/// Forms basic block chains from the natural loop structures.
2652///
2653/// These chains are designed to preserve the existing *structure* of the code
2654/// as much as possible. We can then stitch the chains together in a way which
2655/// both preserves the topological structure and minimizes taken conditional
2656/// branches.
2657void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2658 // First recurse through any nested loops, building chains for those inner
2659 // loops.
2660 for (const MachineLoop *InnerLoop : L)
2661 buildLoopChains(*InnerLoop);
2662
2663 assert(BlockWorkList.empty() &&
2664 "BlockWorkList not empty when starting to build loop chains.");
2665 assert(EHPadWorkList.empty() &&
2666 "EHPadWorkList not empty when starting to build loop chains.");
2667 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2668
2669 // Check if we have profile data for this function. If yes, we will rotate
2670 // this loop by modeling costs more precisely which requires the profile data
2671 // for better layout.
2672 bool RotateLoopWithProfile =
2674 (PreciseRotationCost && F->getFunction().hasProfileData());
2675
2676 // First check to see if there is an obviously preferable top block for the
2677 // loop. This will default to the header, but may end up as one of the
2678 // predecessors to the header if there is one which will result in strictly
2679 // fewer branches in the loop body.
2680 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2681
2682 // If we selected just the header for the loop top, look for a potentially
2683 // profitable exit block in the event that rotating the loop can eliminate
2684 // branches by placing an exit edge at the bottom.
2685 //
2686 // Loops are processed innermost to uttermost, make sure we clear
2687 // PreferredLoopExit before processing a new loop.
2688 PreferredLoopExit = nullptr;
2689 BlockFrequency ExitFreq;
2690 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2691 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2692
2693 BlockChain &LoopChain = *BlockToChain[LoopTop];
2694
2695 // FIXME: This is a really lame way of walking the chains in the loop: we
2696 // walk the blocks, and use a set to prevent visiting a particular chain
2697 // twice.
2698 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2699 assert(LoopChain.UnscheduledPredecessors == 0 &&
2700 "LoopChain should not have unscheduled predecessors.");
2701 UpdatedPreds.insert(&LoopChain);
2702
2703 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2704 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2705
2706 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2707
2708 if (RotateLoopWithProfile)
2709 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2710 else
2711 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2712
2713 LLVM_DEBUG({
2714 // Crash at the end so we get all of the debugging output first.
2715 bool BadLoop = false;
2716 if (LoopChain.UnscheduledPredecessors) {
2717 BadLoop = true;
2718 dbgs() << "Loop chain contains a block without its preds placed!\n"
2719 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2720 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2721 }
2722 for (MachineBasicBlock *ChainBB : LoopChain) {
2723 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2724 if (!LoopBlockSet.remove(ChainBB)) {
2725 // We don't mark the loop as bad here because there are real situations
2726 // where this can occur. For example, with an unanalyzable fallthrough
2727 // from a loop block to a non-loop block or vice versa.
2728 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2729 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2730 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2731 << " Bad block: " << getBlockName(ChainBB) << "\n";
2732 }
2733 }
2734
2735 if (!LoopBlockSet.empty()) {
2736 BadLoop = true;
2737 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2738 dbgs() << "Loop contains blocks never placed into a chain!\n"
2739 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2740 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2741 << " Bad block: " << getBlockName(LoopBB) << "\n";
2742 }
2743 assert(!BadLoop && "Detected problems with the placement of this loop.");
2744 });
2745
2746 BlockWorkList.clear();
2747 EHPadWorkList.clear();
2748}
2749
2750void MachineBlockPlacement::buildCFGChains() {
2751 // Ensure that every BB in the function has an associated chain to simplify
2752 // the assumptions of the remaining algorithm.
2753 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2754 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2755 ++FI) {
2756 MachineBasicBlock *BB = &*FI;
2757 BlockChain *Chain =
2758 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2759 // Also, merge any blocks which we cannot reason about and must preserve
2760 // the exact fallthrough behavior for.
2761 while (true) {
2762 Cond.clear();
2763 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2764 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2765 break;
2766
2767 MachineFunction::iterator NextFI = std::next(FI);
2768 MachineBasicBlock *NextBB = &*NextFI;
2769 // Ensure that the layout successor is a viable block, as we know that
2770 // fallthrough is a possibility.
2771 assert(NextFI != FE && "Can't fallthrough past the last block.");
2772 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2773 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2774 << "\n");
2775 Chain->merge(NextBB, nullptr);
2776#ifndef NDEBUG
2777 BlocksWithUnanalyzableExits.insert(&*BB);
2778#endif
2779 FI = NextFI;
2780 BB = NextBB;
2781 }
2782 }
2783
2784 // Build any loop-based chains.
2785 PreferredLoopExit = nullptr;
2786 for (MachineLoop *L : *MLI)
2787 buildLoopChains(*L);
2788
2789 assert(BlockWorkList.empty() &&
2790 "BlockWorkList should be empty before building final chain.");
2791 assert(EHPadWorkList.empty() &&
2792 "EHPadWorkList should be empty before building final chain.");
2793
2794 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2795 for (MachineBasicBlock &MBB : *F)
2796 fillWorkLists(&MBB, UpdatedPreds);
2797
2798 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2799 buildChain(&F->front(), FunctionChain);
2800
2801#ifndef NDEBUG
2802 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2803#endif
2804 LLVM_DEBUG({
2805 // Crash at the end so we get all of the debugging output first.
2806 bool BadFunc = false;
2807 FunctionBlockSetType FunctionBlockSet;
2808 for (MachineBasicBlock &MBB : *F)
2809 FunctionBlockSet.insert(&MBB);
2810
2811 for (MachineBasicBlock *ChainBB : FunctionChain)
2812 if (!FunctionBlockSet.erase(ChainBB)) {
2813 BadFunc = true;
2814 dbgs() << "Function chain contains a block not in the function!\n"
2815 << " Bad block: " << getBlockName(ChainBB) << "\n";
2816 }
2817
2818 if (!FunctionBlockSet.empty()) {
2819 BadFunc = true;
2820 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2821 dbgs() << "Function contains blocks never placed into a chain!\n"
2822 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2823 }
2824 assert(!BadFunc && "Detected problems with the block placement.");
2825 });
2826
2827 // Remember original layout ordering, so we can update terminators after
2828 // reordering to point to the original layout successor.
2829 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2830 F->getNumBlockIDs());
2831 {
2832 MachineBasicBlock *LastMBB = nullptr;
2833 for (auto &MBB : *F) {
2834 if (LastMBB != nullptr)
2835 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2836 LastMBB = &MBB;
2837 }
2838 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2839 }
2840
2841 // Splice the blocks into place.
2842 MachineFunction::iterator InsertPos = F->begin();
2843 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2844 for (MachineBasicBlock *ChainBB : FunctionChain) {
2845 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2846 : " ... ")
2847 << getBlockName(ChainBB) << "\n");
2848 if (InsertPos != MachineFunction::iterator(ChainBB))
2849 F->splice(InsertPos, ChainBB);
2850 else
2851 ++InsertPos;
2852
2853 // Update the terminator of the previous block.
2854 if (ChainBB == *FunctionChain.begin())
2855 continue;
2856 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2857
2858 // FIXME: It would be awesome of updateTerminator would just return rather
2859 // than assert when the branch cannot be analyzed in order to remove this
2860 // boiler plate.
2861 Cond.clear();
2862 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2863
2864#ifndef NDEBUG
2865 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2866 // Given the exact block placement we chose, we may actually not _need_ to
2867 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2868 // do that at this point is a bug.
2869 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2870 !PrevBB->canFallThrough()) &&
2871 "Unexpected block with un-analyzable fallthrough!");
2872 Cond.clear();
2873 TBB = FBB = nullptr;
2874 }
2875#endif
2876
2877 // The "PrevBB" is not yet updated to reflect current code layout, so,
2878 // o. it may fall-through to a block without explicit "goto" instruction
2879 // before layout, and no longer fall-through it after layout; or
2880 // o. just opposite.
2881 //
2882 // analyzeBranch() may return erroneous value for FBB when these two
2883 // situations take place. For the first scenario FBB is mistakenly set NULL;
2884 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2885 // mistakenly pointing to "*BI".
2886 // Thus, if the future change needs to use FBB before the layout is set, it
2887 // has to correct FBB first by using the code similar to the following:
2888 //
2889 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2890 // PrevBB->updateTerminator();
2891 // Cond.clear();
2892 // TBB = FBB = nullptr;
2893 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2894 // // FIXME: This should never take place.
2895 // TBB = FBB = nullptr;
2896 // }
2897 // }
2898 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2899 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2900 }
2901 }
2902
2903 // Fixup the last block.
2904 Cond.clear();
2905 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2906 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2907 MachineBasicBlock *PrevBB = &F->back();
2908 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2909 }
2910
2911 BlockWorkList.clear();
2912 EHPadWorkList.clear();
2913}
2914
2915void MachineBlockPlacement::optimizeBranches() {
2916 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2917 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2918
2919 // Now that all the basic blocks in the chain have the proper layout,
2920 // make a final call to analyzeBranch with AllowModify set.
2921 // Indeed, the target may be able to optimize the branches in a way we
2922 // cannot because all branches may not be analyzable.
2923 // E.g., the target may be able to remove an unconditional branch to
2924 // a fallthrough when it occurs after predicated terminators.
2925 for (MachineBasicBlock *ChainBB : FunctionChain) {
2926 Cond.clear();
2927 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2928 if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
2929 // If PrevBB has a two-way branch, try to re-order the branches
2930 // such that we branch to the successor with higher probability first.
2931 if (TBB && !Cond.empty() && FBB &&
2932 MBPI->getEdgeProbability(ChainBB, FBB) >
2933 MBPI->getEdgeProbability(ChainBB, TBB) &&
2935 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2936 << getBlockName(ChainBB) << "\n");
2937 LLVM_DEBUG(dbgs() << " Edge probability: "
2938 << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
2939 << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
2940 DebugLoc dl; // FIXME: this is nowhere
2941 TII->removeBranch(*ChainBB);
2942 TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
2943 }
2944 }
2945 }
2946}
2947
2948void MachineBlockPlacement::alignBlocks() {
2949 // Walk through the backedges of the function now that we have fully laid out
2950 // the basic blocks and align the destination of each backedge. We don't rely
2951 // exclusively on the loop info here so that we can align backedges in
2952 // unnatural CFGs and backedges that were introduced purely because of the
2953 // loop rotations done during this layout pass.
2954 if (F->getFunction().hasMinSize() ||
2955 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
2956 return;
2957 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2958 if (FunctionChain.begin() == FunctionChain.end())
2959 return; // Empty chain.
2960
2961 const BranchProbability ColdProb(1, 5); // 20%
2962 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
2963 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
2964 for (MachineBasicBlock *ChainBB : FunctionChain) {
2965 if (ChainBB == *FunctionChain.begin())
2966 continue;
2967
2968 // Don't align non-looping basic blocks. These are unlikely to execute
2969 // enough times to matter in practice. Note that we'll still handle
2970 // unnatural CFGs inside of a natural outer loop (the common case) and
2971 // rotated loops.
2972 MachineLoop *L = MLI->getLoopFor(ChainBB);
2973 if (!L)
2974 continue;
2975
2976 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
2977 unsigned MDAlign = 1;
2978 MDNode *LoopID = L->getLoopID();
2979 if (LoopID) {
2980 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
2981 MDNode *MD = dyn_cast<MDNode>(MDO);
2982 if (MD == nullptr)
2983 continue;
2984 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
2985 if (S == nullptr)
2986 continue;
2987 if (S->getString() == "llvm.loop.align") {
2988 assert(MD->getNumOperands() == 2 &&
2989 "per-loop align metadata should have two operands.");
2990 MDAlign =
2991 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
2992 assert(MDAlign >= 1 && "per-loop align value must be positive.");
2993 }
2994 }
2995 }
2996
2997 // Use max of the TLIAlign and MDAlign
2998 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
2999 if (LoopAlign == 1)
3000 continue; // Don't care about loop alignment.
3001
3002 // If the block is cold relative to the function entry don't waste space
3003 // aligning it.
3004 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3005 if (Freq < WeightedEntryFreq)
3006 continue;
3007
3008 // If the block is cold relative to its loop header, don't align it
3009 // regardless of what edges into the block exist.
3010 MachineBasicBlock *LoopHeader = L->getHeader();
3011 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3012 if (Freq < (LoopHeaderFreq * ColdProb))
3013 continue;
3014
3015 // If the global profiles indicates so, don't align it.
3016 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3017 !TLI->alignLoopsWithOptSize())
3018 continue;
3019
3020 // Check for the existence of a non-layout predecessor which would benefit
3021 // from aligning this block.
3022 MachineBasicBlock *LayoutPred =
3023 &*std::prev(MachineFunction::iterator(ChainBB));
3024
3025 auto DetermineMaxAlignmentPadding = [&]() {
3026 // Set the maximum bytes allowed to be emitted for alignment.
3027 unsigned MaxBytes;
3028 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3030 else
3031 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3032 ChainBB->setMaxBytesForAlignment(MaxBytes);
3033 };
3034
3035 // Force alignment if all the predecessors are jumps. We already checked
3036 // that the block isn't cold above.
3037 if (!LayoutPred->isSuccessor(ChainBB)) {
3038 ChainBB->setAlignment(LoopAlign);
3039 DetermineMaxAlignmentPadding();
3040 continue;
3041 }
3042
3043 // Align this block if the layout predecessor's edge into this block is
3044 // cold relative to the block. When this is true, other predecessors make up
3045 // all of the hot entries into the block and thus alignment is likely to be
3046 // important.
3047 BranchProbability LayoutProb =
3048 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3049 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3050 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3051 ChainBB->setAlignment(LoopAlign);
3052 DetermineMaxAlignmentPadding();
3053 }
3054 }
3055}
3056
3057/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3058/// it was duplicated into its chain predecessor and removed.
3059/// \p BB - Basic block that may be duplicated.
3060///
3061/// \p LPred - Chosen layout predecessor of \p BB.
3062/// Updated to be the chain end if LPred is removed.
3063/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3064/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3065/// Used to identify which blocks to update predecessor
3066/// counts.
3067/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3068/// chosen in the given order due to unnatural CFG
3069/// only needed if \p BB is removed and
3070/// \p PrevUnplacedBlockIt pointed to \p BB.
3071/// @return true if \p BB was removed.
3072bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3074 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3075 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3076 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3077 bool Removed, DuplicatedToLPred;
3078 bool DuplicatedToOriginalLPred;
3079 Removed = maybeTailDuplicateBlock(
3080 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3081 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3082 if (!Removed)
3083 return false;
3084 DuplicatedToOriginalLPred = DuplicatedToLPred;
3085 // Iteratively try to duplicate again. It can happen that a block that is
3086 // duplicated into is still small enough to be duplicated again.
3087 // No need to call markBlockSuccessors in this case, as the blocks being
3088 // duplicated from here on are already scheduled.
3089 while (DuplicatedToLPred && Removed) {
3090 MachineBasicBlock *DupBB, *DupPred;
3091 // The removal callback causes Chain.end() to be updated when a block is
3092 // removed. On the first pass through the loop, the chain end should be the
3093 // same as it was on function entry. On subsequent passes, because we are
3094 // duplicating the block at the end of the chain, if it is removed the
3095 // chain will have shrunk by one block.
3096 BlockChain::iterator ChainEnd = Chain.end();
3097 DupBB = *(--ChainEnd);
3098 // Now try to duplicate again.
3099 if (ChainEnd == Chain.begin())
3100 break;
3101 DupPred = *std::prev(ChainEnd);
3102 Removed = maybeTailDuplicateBlock(
3103 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3104 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3105 }
3106 // If BB was duplicated into LPred, it is now scheduled. But because it was
3107 // removed, markChainSuccessors won't be called for its chain. Instead we
3108 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3109 // at the end because repeating the tail duplication can increase the number
3110 // of unscheduled predecessors.
3111 LPred = *std::prev(Chain.end());
3112 if (DuplicatedToOriginalLPred)
3113 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3114 return true;
3115}
3116
3117/// Tail duplicate \p BB into (some) predecessors if profitable.
3118/// \p BB - Basic block that may be duplicated
3119/// \p LPred - Chosen layout predecessor of \p BB
3120/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3121/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3122/// Used to identify which blocks to update predecessor
3123/// counts.
3124/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3125/// chosen in the given order due to unnatural CFG
3126/// only needed if \p BB is removed and
3127/// \p PrevUnplacedBlockIt pointed to \p BB.
3128/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3129/// \return - True if the block was duplicated into all preds and removed.
3130bool MachineBlockPlacement::maybeTailDuplicateBlock(
3131 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3132 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3133 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3134 bool &DuplicatedToLPred) {
3135 DuplicatedToLPred = false;
3136 if (!shouldTailDuplicate(BB))
3137 return false;
3138
3139 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3140 << "\n");
3141
3142 // This has to be a callback because none of it can be done after
3143 // BB is deleted.
3144 bool Removed = false;
3145 auto RemovalCallback =
3146 [&](MachineBasicBlock *RemBB) {
3147 // Signal to outer function
3148 Removed = true;
3149
3150 // Conservative default.
3151 bool InWorkList = true;
3152 // Remove from the Chain and Chain Map
3153 if (BlockToChain.count(RemBB)) {
3154 BlockChain *Chain = BlockToChain[RemBB];
3155 InWorkList = Chain->UnscheduledPredecessors == 0;
3156 Chain->remove(RemBB);
3157 BlockToChain.erase(RemBB);
3158 }
3159
3160 // Handle the unplaced block iterator
3161 if (&(*PrevUnplacedBlockIt) == RemBB) {
3162 PrevUnplacedBlockIt++;
3163 }
3164
3165 // Handle the Work Lists
3166 if (InWorkList) {
3167 SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
3168 if (RemBB->isEHPad())
3169 RemoveList = EHPadWorkList;
3170 llvm::erase(RemoveList, RemBB);
3171 }
3172
3173 // Handle the filter set
3174 if (BlockFilter) {
3175 auto It = llvm::find(*BlockFilter, RemBB);
3176 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3177 // pointing to the same element as before.
3178 if (It != BlockFilter->end()) {
3179 if (It < PrevUnplacedBlockInFilterIt) {
3180 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3181 // BlockFilter is a SmallVector so all elements after RemBB are
3182 // shifted to the front by 1 after its deletion.
3183 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3184 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3185 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3186 (void)PrevBB;
3187 } else if (It == PrevUnplacedBlockInFilterIt)
3188 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3189 // have to set it to the next element.
3190 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3191 else
3192 BlockFilter->erase(It);
3193 }
3194 }
3195
3196 // Remove the block from loop info.
3197 MLI->removeBlock(RemBB);
3198 if (RemBB == PreferredLoopExit)
3199 PreferredLoopExit = nullptr;
3200
3201 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
3202 << getBlockName(RemBB) << "\n");
3203 };
3204 auto RemovalCallbackRef =
3206
3208 bool IsSimple = TailDup.isSimpleBB(BB);
3210 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3211 if (F->getFunction().hasProfileData()) {
3212 // We can do partial duplication with precise profile information.
3213 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3214 if (CandidatePreds.size() == 0)
3215 return false;
3216 if (CandidatePreds.size() < BB->pred_size())
3217 CandidatePtr = &CandidatePreds;
3218 }
3219 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3220 &RemovalCallbackRef, CandidatePtr);
3221
3222 // Update UnscheduledPredecessors to reflect tail-duplication.
3223 DuplicatedToLPred = false;
3224 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3225 // We're only looking for unscheduled predecessors that match the filter.
3226 BlockChain* PredChain = BlockToChain[Pred];
3227 if (Pred == LPred)
3228 DuplicatedToLPred = true;
3229 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
3230 || PredChain == &Chain)
3231 continue;
3232 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3233 if (BlockFilter && !BlockFilter->count(NewSucc))
3234 continue;
3235 BlockChain *NewChain = BlockToChain[NewSucc];
3236 if (NewChain != &Chain && NewChain != PredChain)
3237 NewChain->UnscheduledPredecessors++;
3238 }
3239 }
3240 return Removed;
3241}
3242
3243// Count the number of actual machine instructions.
3245 uint64_t InstrCount = 0;
3246 for (MachineInstr &MI : *MBB) {
3247 if (!MI.isPHI() && !MI.isMetaInstruction())
3248 InstrCount += 1;
3249 }
3250 return InstrCount;
3251}
3252
3253// The size cost of duplication is the instruction size of the duplicated block.
3254// So we should scale the threshold accordingly. But the instruction size is not
3255// available on all targets, so we use the number of instructions instead.
3256BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3257 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3258}
3259
3260// Returns true if BB is Pred's best successor.
3261bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3262 MachineBasicBlock *Pred,
3263 BlockFilterSet *BlockFilter) {
3264 if (BB == Pred)
3265 return false;
3266 if (BlockFilter && !BlockFilter->count(Pred))
3267 return false;
3268 BlockChain *PredChain = BlockToChain[Pred];
3269 if (PredChain && (Pred != *std::prev(PredChain->end())))
3270 return false;
3271
3272 // Find the successor with largest probability excluding BB.
3274 for (MachineBasicBlock *Succ : Pred->successors())
3275 if (Succ != BB) {
3276 if (BlockFilter && !BlockFilter->count(Succ))
3277 continue;
3278 BlockChain *SuccChain = BlockToChain[Succ];
3279 if (SuccChain && (Succ != *SuccChain->begin()))
3280 continue;
3281 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3282 if (SuccProb > BestProb)
3283 BestProb = SuccProb;
3284 }
3285
3286 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3287 if (BBProb <= BestProb)
3288 return false;
3289
3290 // Compute the number of reduced taken branches if Pred falls through to BB
3291 // instead of another successor. Then compare it with threshold.
3292 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3293 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3294 return Gain > scaleThreshold(BB);
3295}
3296
3297// Find out the predecessors of BB and BB can be beneficially duplicated into
3298// them.
3299void MachineBlockPlacement::findDuplicateCandidates(
3302 BlockFilterSet *BlockFilter) {
3303 MachineBasicBlock *Fallthrough = nullptr;
3304 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3305 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3308
3309 // Sort for highest frequency.
3310 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3311 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3312 };
3313 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3314 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3315 };
3316 llvm::stable_sort(Succs, CmpSucc);
3317 llvm::stable_sort(Preds, CmpPred);
3318
3319 auto SuccIt = Succs.begin();
3320 if (SuccIt != Succs.end()) {
3321 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3322 }
3323
3324 // For each predecessors of BB, compute the benefit of duplicating BB,
3325 // if it is larger than the threshold, add it into Candidates.
3326 //
3327 // If we have following control flow.
3328 //
3329 // PB1 PB2 PB3 PB4
3330 // \ | / /\
3331 // \ | / / \
3332 // \ |/ / \
3333 // BB----/ OB
3334 // /\
3335 // / \
3336 // SB1 SB2
3337 //
3338 // And it can be partially duplicated as
3339 //
3340 // PB2+BB
3341 // | PB1 PB3 PB4
3342 // | | / /\
3343 // | | / / \
3344 // | |/ / \
3345 // | BB----/ OB
3346 // |\ /|
3347 // | X |
3348 // |/ \|
3349 // SB2 SB1
3350 //
3351 // The benefit of duplicating into a predecessor is defined as
3352 // Orig_taken_branch - Duplicated_taken_branch
3353 //
3354 // The Orig_taken_branch is computed with the assumption that predecessor
3355 // jumps to BB and the most possible successor is laid out after BB.
3356 //
3357 // The Duplicated_taken_branch is computed with the assumption that BB is
3358 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3359 // SB2 for PB2 in our case). If there is no available successor, the combined
3360 // block jumps to all BB's successor, like PB3 in this example.
3361 //
3362 // If a predecessor has multiple successors, so BB can't be duplicated into
3363 // it. But it can beneficially fall through to BB, and duplicate BB into other
3364 // predecessors.
3365 for (MachineBasicBlock *Pred : Preds) {
3366 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3367
3368 if (!TailDup.canTailDuplicate(BB, Pred)) {
3369 // BB can't be duplicated into Pred, but it is possible to be layout
3370 // below Pred.
3371 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3372 Fallthrough = Pred;
3373 if (SuccIt != Succs.end())
3374 SuccIt++;
3375 }
3376 continue;
3377 }
3378
3379 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3380 BlockFrequency DupCost;
3381 if (SuccIt == Succs.end()) {
3382 // Jump to all successors;
3383 if (Succs.size() > 0)
3384 DupCost += PredFreq;
3385 } else {
3386 // Fallthrough to *SuccIt, jump to all other successors;
3387 DupCost += PredFreq;
3388 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3389 }
3390
3391 assert(OrigCost >= DupCost);
3392 OrigCost -= DupCost;
3393 if (OrigCost > BBDupThreshold) {
3394 Candidates.push_back(Pred);
3395 if (SuccIt != Succs.end())
3396 SuccIt++;
3397 }
3398 }
3399
3400 // No predecessors can optimally fallthrough to BB.
3401 // So we can change one duplication into fallthrough.
3402 if (!Fallthrough) {
3403 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3404 Candidates[0] = Candidates.back();
3405 Candidates.pop_back();
3406 }
3407 }
3408}
3409
3410void MachineBlockPlacement::initDupThreshold() {
3411 DupThreshold = BlockFrequency(0);
3412 if (!F->getFunction().hasProfileData())
3413 return;
3414
3415 // We prefer to use prifile count.
3416 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3417 if (HotThreshold != UINT64_MAX) {
3418 UseProfileCount = true;
3419 DupThreshold =
3420 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3421 return;
3422 }
3423
3424 // Profile count is not available, we can use block frequency instead.
3425 BlockFrequency MaxFreq = BlockFrequency(0);
3426 for (MachineBasicBlock &MBB : *F) {
3427 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3428 if (Freq > MaxFreq)
3429 MaxFreq = Freq;
3430 }
3431
3432 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3433 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3434 UseProfileCount = false;
3435}
3436
3437bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3438 if (skipFunction(MF.getFunction()))
3439 return false;
3440
3441 // Check for single-block functions and skip them.
3442 if (std::next(MF.begin()) == MF.end())
3443 return false;
3444
3445 F = &MF;
3446 MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3447 MBFI = std::make_unique<MBFIWrapper>(
3448 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
3449 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3450 TII = MF.getSubtarget().getInstrInfo();
3451 TLI = MF.getSubtarget().getTargetLowering();
3452 MPDT = nullptr;
3453 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
3454
3455 initDupThreshold();
3456
3457 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3458 // there are no MachineLoops.
3459 PreferredLoopExit = nullptr;
3460
3461 assert(BlockToChain.empty() &&
3462 "BlockToChain map should be empty before starting placement.");
3463 assert(ComputedEdges.empty() &&
3464 "Computed Edge map should be empty before starting placement.");
3465
3466 unsigned TailDupSize = TailDupPlacementThreshold;
3467 // If only the aggressive threshold is explicitly set, use it.
3468 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3469 TailDupPlacementThreshold.getNumOccurrences() == 0)
3471
3472 TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
3473 // For aggressive optimization, we can adjust some thresholds to be less
3474 // conservative.
3475 if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) {
3476 // At O3 we should be more willing to copy blocks for tail duplication. This
3477 // increases size pressure, so we only do it at O3
3478 // Do this unless only the regular threshold is explicitly set.
3479 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3480 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3482 }
3483
3484 // If there's no threshold provided through options, query the target
3485 // information for a threshold instead.
3486 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3487 (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive ||
3488 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3489 TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
3490
3491 if (allowTailDupPlacement()) {
3492 MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3493 bool OptForSize = MF.getFunction().hasOptSize() ||
3494 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3495 if (OptForSize)
3496 TailDupSize = 1;
3497 bool PreRegAlloc = false;
3498 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3499 /* LayoutMode */ true, TailDupSize);
3500 precomputeTriangleChains();
3501 }
3502
3503 buildCFGChains();
3504
3505 // Changing the layout can create new tail merging opportunities.
3506 // TailMerge can create jump into if branches that make CFG irreducible for
3507 // HW that requires structured CFG.
3508 bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3509 PassConfig->getEnableTailMerge() &&
3511 // No tail merging opportunities if the block number is less than four.
3512 if (MF.size() > 3 && EnableTailMerge) {
3513 unsigned TailMergeSize = TailDupSize + 1;
3514 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3515 *MBFI, *MBPI, PSI, TailMergeSize);
3516
3517 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3518 /*AfterPlacement=*/true)) {
3519 // Redo the layout if tail merging creates/removes/moves blocks.
3520 BlockToChain.clear();
3521 ComputedEdges.clear();
3522 // Must redo the post-dominator tree if blocks were changed.
3523 if (MPDT)
3524 MPDT->recalculate(MF);
3525 ChainAllocator.DestroyAll();
3526 buildCFGChains();
3527 }
3528 }
3529
3530 // Apply a post-processing optimizing block placement.
3531 if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3534 // Find a new placement and modify the layout of the blocks in the function.
3535 applyExtTsp();
3536
3537 // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3538 createCFGChainExtTsp();
3539 }
3540
3541 optimizeBranches();
3542 alignBlocks();
3543
3544 BlockToChain.clear();
3545 ComputedEdges.clear();
3546 ChainAllocator.DestroyAll();
3547
3548 bool HasMaxBytesOverride =
3549 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3550
3551 if (AlignAllBlock)
3552 // Align all of the blocks in the function to a specific alignment.
3553 for (MachineBasicBlock &MBB : MF) {
3554 if (HasMaxBytesOverride)
3557 else
3559 }
3560 else if (AlignAllNonFallThruBlocks) {
3561 // Align all of the blocks that have no fall-through predecessors to a
3562 // specific alignment.
3563 for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
3564 auto LayoutPred = std::prev(MBI);
3565 if (!LayoutPred->isSuccessor(&*MBI)) {
3566 if (HasMaxBytesOverride)
3567 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3569 else
3570 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3571 }
3572 }
3573 }
3575 (ViewBlockFreqFuncName.empty() ||
3576 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3578 MF.RenumberBlocks();
3579 MBFI->view("MBP." + MF.getName(), false);
3580 }
3581
3582 // We always return true as we have no way to track whether the final order
3583 // differs from the original order.
3584 return true;
3585}
3586
3587void MachineBlockPlacement::applyExtTsp() {
3588 // Prepare data; blocks are indexed by their index in the current ordering.
3590 BlockIndex.reserve(F->size());
3591 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3592 CurrentBlockOrder.reserve(F->size());
3593 size_t NumBlocks = 0;
3594 for (const MachineBasicBlock &MBB : *F) {
3595 BlockIndex[&MBB] = NumBlocks++;
3596 CurrentBlockOrder.push_back(&MBB);
3597 }
3598
3599 auto BlockSizes = std::vector<uint64_t>(F->size());
3600 auto BlockCounts = std::vector<uint64_t>(F->size());
3601 std::vector<codelayout::EdgeCount> JumpCounts;
3602 for (MachineBasicBlock &MBB : *F) {
3603 // Getting the block frequency.
3604 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3605 BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3606 // Getting the block size:
3607 // - approximate the size of an instruction by 4 bytes, and
3608 // - ignore debug instructions.
3609 // Note: getting the exact size of each block is target-dependent and can be
3610 // done by extending the interface of MCCodeEmitter. Experimentally we do
3611 // not see a perf improvement with the exact block sizes.
3612 auto NonDbgInsts =
3614 int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3615 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3616 // Getting jump frequencies.
3617 for (MachineBasicBlock *Succ : MBB.successors()) {
3618 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3619 BlockFrequency JumpFreq = BlockFreq * EP;
3620 JumpCounts.push_back(
3621 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3622 }
3623 }
3624
3625 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3626 << " with profile = " << F->getFunction().hasProfileData()
3627 << " (" << F->getName().str() << ")"
3628 << "\n");
3629 LLVM_DEBUG(
3630 dbgs() << format(" original layout score: %0.2f\n",
3631 calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3632
3633 // Run the layout algorithm.
3634 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3635 std::vector<const MachineBasicBlock *> NewBlockOrder;
3636 NewBlockOrder.reserve(F->size());
3637 for (uint64_t Node : NewOrder) {
3638 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3639 }
3640 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3641 calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3642 JumpCounts)));
3643
3644 // Assign new block order.
3645 assignBlockOrder(NewBlockOrder);
3646}
3647
3648void MachineBlockPlacement::assignBlockOrder(
3649 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3650 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3651 F->RenumberBlocks();
3652 // At this point, we possibly removed blocks from the function, so we can't
3653 // renumber the domtree. At this point, we don't need it anymore, though.
3654 // TODO: move this to the point where the dominator tree is actually
3655 // invalidated (i.e., where blocks are removed without updating the domtree).
3656 MPDT = nullptr;
3657
3658 bool HasChanges = false;
3659 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3660 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3661 HasChanges = true;
3662 break;
3663 }
3664 }
3665 // Stop early if the new block order is identical to the existing one.
3666 if (!HasChanges)
3667 return;
3668
3669 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3670 for (auto &MBB : *F) {
3671 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3672 }
3673
3674 // Sort basic blocks in the function according to the computed order.
3676 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3677 NewIndex[MBB] = NewIndex.size();
3678 }
3679 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3680 return NewIndex[&L] < NewIndex[&R];
3681 });
3682
3683 // Update basic block branches by inserting explicit fallthrough branches
3684 // when required and re-optimize branches when possible.
3685 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3687 for (auto &MBB : *F) {
3688 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3690 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3691 // If this block had a fallthrough before we need an explicit unconditional
3692 // branch to that block if the fallthrough block is not adjacent to the
3693 // block in the new order.
3694 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3695 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3696 }
3697
3698 // It might be possible to optimize branches by flipping the condition.
3699 Cond.clear();
3700 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3701 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3702 continue;
3703 MBB.updateTerminator(FTMBB);
3704 }
3705
3706#ifndef NDEBUG
3707 // Make sure we correctly constructed all branches.
3708 F->verify(this, "After optimized block reordering");
3709#endif
3710}
3711
3712void MachineBlockPlacement::createCFGChainExtTsp() {
3713 BlockToChain.clear();
3714 ComputedEdges.clear();
3715 ChainAllocator.DestroyAll();
3716
3717 MachineBasicBlock *HeadBB = &F->front();
3718 BlockChain *FunctionChain =
3719 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3720
3721 for (MachineBasicBlock &MBB : *F) {
3722 if (HeadBB == &MBB)
3723 continue; // Ignore head of the chain
3724 FunctionChain->merge(&MBB, nullptr);
3725 }
3726}
3727
3728namespace {
3729
3730/// A pass to compute block placement statistics.
3731///
3732/// A separate pass to compute interesting statistics for evaluating block
3733/// placement. This is separate from the actual placement pass so that they can
3734/// be computed in the absence of any placement transformations or when using
3735/// alternative placement strategies.
3736class MachineBlockPlacementStats : public MachineFunctionPass {
3737 /// A handle to the branch probability pass.
3738 const MachineBranchProbabilityInfo *MBPI;
3739
3740 /// A handle to the function-wide block frequency pass.
3741 const MachineBlockFrequencyInfo *MBFI;
3742
3743public:
3744 static char ID; // Pass identification, replacement for typeid
3745
3746 MachineBlockPlacementStats() : MachineFunctionPass(ID) {
3748 }
3749
3750 bool runOnMachineFunction(MachineFunction &F) override;
3751
3752 void getAnalysisUsage(AnalysisUsage &AU) const override {
3755 AU.setPreservesAll();
3757 }
3758};
3759
3760} // end anonymous namespace
3761
3762char MachineBlockPlacementStats::ID = 0;
3763
3764char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
3765
3766INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
3767 "Basic Block Placement Stats", false, false)
3770INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
3772
3773bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
3774 // Check for single-block functions and skip them.
3775 if (std::next(F.begin()) == F.end())
3776 return false;
3777
3778 if (!isFunctionInPrintList(F.getName()))
3779 return false;
3780
3781 MBPI = &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3782 MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3783
3784 for (MachineBasicBlock &MBB : F) {
3785 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3786 Statistic &NumBranches =
3787 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3788 Statistic &BranchTakenFreq =
3789 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3790 for (MachineBasicBlock *Succ : MBB.successors()) {
3791 // Skip if this successor is a fallthrough.
3792 if (MBB.isLayoutSuccessor(Succ))
3793 continue;
3794
3795 BlockFrequency EdgeFreq =
3796 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3797 ++NumBranches;
3798 BranchTakenFreq += EdgeFreq.getFrequency();
3799 }
3800 }
3801
3802 return false;
3803}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:533
static unsigned InstrCount
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
block placement Basic Block Placement Stats
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
Branch Probability Basic Block Placement
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
unify loop Fixup each natural loop to have a single exit block
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:336
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:146
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:705
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:333
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:891
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:616
unsigned pred_size() const
bool isEHPad() const
Returns true if the block is a landing pad.
instr_iterator instr_begin()
MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
unsigned succ_size() const
void setAlignment(Align A)
Set alignment of the basic block.
bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
size_type size() const
Definition: SmallPtrSet.h:95
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:389
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition: Allocator.h:439
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition: Allocator.h:410
Utility class to perform tail duplication.
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
TargetInstrInfo - Interface to description of machine instruction set.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
Target-Independent Code Generator Pass Configuration Options.
bool getEnableTailMerge() const
CodeGenOptLevel getOptLevel() const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:132
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define UINT64_MAX
Definition: DataTypes.h:77
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition: Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition: STLExtras.h:2020
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
cl::opt< bool > ApplyExtTspWithoutProfile
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
void initializeMachineBlockPlacementStatsPass(PassRegistry &)
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2090
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
cl::opt< unsigned > StaticLikelyProb
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1921
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2082
Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
void initializeMachineBlockPlacementPass(PassRegistry &)
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39