LLVM 23.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
28#include "BranchFolding.h"
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/ADT/DenseMap.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
35#include "llvm/ADT/Statistic.h"
52#include "llvm/IR/DebugLoc.h"
53#include "llvm/IR/Function.h"
54#include "llvm/IR/PrintPasses.h"
56#include "llvm/Pass.h"
63#include "llvm/Support/Debug.h"
67#include <algorithm>
68#include <cassert>
69#include <cstdint>
70#include <iterator>
71#include <memory>
72#include <string>
73#include <tuple>
74#include <utility>
75#include <vector>
76
77using namespace llvm;
78
79#define DEBUG_TYPE "block-placement"
80
81STATISTIC(NumCondBranches, "Number of conditional branches");
82STATISTIC(NumUncondBranches, "Number of unconditional branches");
83STATISTIC(CondBranchTakenFreq,
84 "Potential frequency of taking conditional branches");
85STATISTIC(UncondBranchTakenFreq,
86 "Potential frequency of taking unconditional branches");
87
89 "align-all-blocks",
90 cl::desc("Force the alignment of all blocks in the function in log2 format "
91 "(e.g 4 means align on 16B boundaries)."),
93
95 "align-all-nofallthru-blocks",
96 cl::desc("Force the alignment of all blocks that have no fall-through "
97 "predecessors (i.e. don't add nops that are executed). In log2 "
98 "format (e.g 4 means align on 16B boundaries)."),
100
102 "max-bytes-for-alignment",
103 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
104 "alignment"),
105 cl::init(0), cl::Hidden);
106
108 "block-placement-predecessor-limit",
109 cl::desc("For blocks with more predecessors, certain layout optimizations"
110 "will be disabled to prevent quadratic compile time."),
111 cl::init(1000), cl::Hidden);
112
113// FIXME: Find a good default for this flag and remove the flag.
115 "block-placement-exit-block-bias",
116 cl::desc("Block frequency percentage a loop exit block needs "
117 "over the original exit to be considered the new exit."),
118 cl::init(0), cl::Hidden);
119
120// Definition:
121// - Outlining: placement of a basic block outside the chain or hot path.
122
124 "loop-to-cold-block-ratio",
125 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
126 "(frequency of block) is greater than this ratio"),
127 cl::init(5), cl::Hidden);
128
129static cl::opt<bool>
130 ForceLoopColdBlock("force-loop-cold-block",
131 cl::desc("Force outlining cold blocks from loops."),
132 cl::init(false), cl::Hidden);
133
134static cl::opt<bool>
135 PreciseRotationCost("precise-rotation-cost",
136 cl::desc("Model the cost of loop rotation more "
137 "precisely by using profile data."),
138 cl::init(false), cl::Hidden);
139
140static cl::opt<bool>
141 ForcePreciseRotationCost("force-precise-rotation-cost",
142 cl::desc("Force the use of precise cost "
143 "loop rotation strategy."),
144 cl::init(false), cl::Hidden);
145
147 "misfetch-cost",
148 cl::desc("Cost that models the probabilistic risk of an instruction "
149 "misfetch due to a jump comparing to falling through, whose cost "
150 "is zero."),
151 cl::init(1), cl::Hidden);
152
153static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154 cl::desc("Cost of jump instructions."),
155 cl::init(1), cl::Hidden);
156static cl::opt<bool>
157 TailDupPlacement("tail-dup-placement",
158 cl::desc("Perform tail duplication during placement. "
159 "Creates more fallthrough opportunities in "
160 "outline branches."),
161 cl::init(true), cl::Hidden);
162
163static cl::opt<bool>
164 BranchFoldPlacement("branch-fold-placement",
165 cl::desc("Perform branch folding during placement. "
166 "Reduces code size."),
167 cl::init(true), cl::Hidden);
168
169// Heuristic for tail duplication.
171 "tail-dup-placement-threshold",
172 cl::desc("Instruction cutoff for tail duplication during layout. "
173 "Tail merging during layout is forced to have a threshold "
174 "that won't conflict."),
175 cl::init(2), cl::Hidden);
176
177// Heuristic for aggressive tail duplication.
179 "tail-dup-placement-aggressive-threshold",
180 cl::desc("Instruction cutoff for aggressive tail duplication during "
181 "layout. Used at -O3. Tail merging during layout is forced to "
182 "have a threshold that won't conflict."),
183 cl::init(4), cl::Hidden);
184
185// Heuristic for tail duplication.
187 "tail-dup-placement-penalty",
188 cl::desc(
189 "Cost penalty for blocks that can avoid breaking CFG by copying. "
190 "Copying can increase fallthrough, but it also increases icache "
191 "pressure. This parameter controls the penalty to account for that. "
192 "Percent as integer."),
193 cl::init(2), cl::Hidden);
194
195// Heuristic for tail duplication if profile count is used in cost model.
197 "tail-dup-profile-percent-threshold",
198 cl::desc("If profile count information is used in tail duplication cost "
199 "model, the gained fall through number from tail duplication "
200 "should be at least this percent of hot count."),
201 cl::init(50), cl::Hidden);
202
203// Heuristic for triangle chains.
205 "triangle-chain-count",
206 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
207 "triangle tail duplication heuristic to kick in. 0 to disable."),
208 cl::init(2), cl::Hidden);
209
210// Use case: When block layout is visualized after MBP pass, the basic blocks
211// are labeled in layout order; meanwhile blocks could be numbered in a
212// different order. It's hard to map between the graph and pass output.
213// With this option on, the basic blocks are renumbered in function layout
214// order. For debugging only.
216 "renumber-blocks-before-view",
217 cl::desc(
218 "If true, basic blocks are re-numbered before MBP layout is printed "
219 "into a dot graph. Only used when a function is being printed."),
220 cl::init(false), cl::Hidden);
221
223 "ext-tsp-block-placement-max-blocks",
224 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
225 "block placement."),
226 cl::init(UINT_MAX), cl::Hidden);
227
228// Apply the ext-tsp algorithm minimizing the size of a binary.
229static cl::opt<bool>
230 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
231 cl::desc("Use ext-tsp for size-aware block placement."));
232
233namespace llvm {
238
239// Internal option used to control BFI display only after MBP pass.
240// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241// -view-block-layout-with-bfi=
243
244// Command line option to specify the name of the function for CFG dump
245// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
247} // namespace llvm
248
249namespace {
250
251class BlockChain;
252
253/// Type for our function-wide basic block -> block chain mapping.
254using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
255
256/// A chain of blocks which will be laid out contiguously.
257///
258/// This is the datastructure representing a chain of consecutive blocks that
259/// are profitable to layout together in order to maximize fallthrough
260/// probabilities and code locality. We also can use a block chain to represent
261/// a sequence of basic blocks which have some external (correctness)
262/// requirement for sequential layout.
263///
264/// Chains can be built around a single basic block and can be merged to grow
265/// them. They participate in a block-to-chain mapping, which is updated
266/// automatically as chains are merged together.
267class BlockChain {
268 /// The sequence of blocks belonging to this chain.
269 ///
270 /// This is the sequence of blocks for a particular chain. These will be laid
271 /// out in-order within the function.
273
274 /// A handle to the function-wide basic block to block chain mapping.
275 ///
276 /// This is retained in each block chain to simplify the computation of child
277 /// block chains for SCC-formation and iteration. We store the edges to child
278 /// basic blocks, and map them back to their associated chains using this
279 /// structure.
280 BlockToChainMapType &BlockToChain;
281
282public:
283 /// Construct a new BlockChain.
284 ///
285 /// This builds a new block chain representing a single basic block in the
286 /// function. It also registers itself as the chain that block participates
287 /// in with the BlockToChain mapping.
288 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289 : Blocks(1, BB), BlockToChain(BlockToChain) {
290 assert(BB && "Cannot create a chain with a null basic block");
291 BlockToChain[BB] = this;
292 }
293
294 /// Iterator over blocks within the chain.
297
298 /// Beginning of blocks within the chain.
299 iterator begin() { return Blocks.begin(); }
300 const_iterator begin() const { return Blocks.begin(); }
301
302 /// End of blocks within the chain.
303 iterator end() { return Blocks.end(); }
304 const_iterator end() const { return Blocks.end(); }
305
306 bool remove(MachineBasicBlock *BB) {
307 for (iterator i = begin(); i != end(); ++i) {
308 if (*i == BB) {
309 Blocks.erase(i);
310 return true;
311 }
312 }
313 return false;
314 }
315
316 /// Merge a block chain into this one.
317 ///
318 /// This routine merges a block chain into this one. It takes care of forming
319 /// a contiguous sequence of basic blocks, updating the edge list, and
320 /// updating the block -> chain mapping. It does not free or tear down the
321 /// old chain, but the old chain's block list is no longer valid.
322 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
323 assert(BB && "Can't merge a null block.");
324 assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326 // Fast path in case we don't have a chain already.
327 if (!Chain) {
328 assert(!BlockToChain[BB] &&
329 "Passed chain is null, but BB has entry in BlockToChain.");
330 Blocks.push_back(BB);
331 BlockToChain[BB] = this;
332 return;
333 }
334
335 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336 assert(Chain->begin() != Chain->end());
337
338 // Update the incoming blocks to point to this chain, and add them to the
339 // chain structure.
340 for (MachineBasicBlock *ChainBB : *Chain) {
341 Blocks.push_back(ChainBB);
342 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343 BlockToChain[ChainBB] = this;
344 }
345 }
346
347#ifndef NDEBUG
348 /// Dump the blocks in this chain.
349 LLVM_DUMP_METHOD void dump() {
350 for (MachineBasicBlock *MBB : *this)
351 MBB->dump();
352 }
353#endif // NDEBUG
354
355 /// Count of predecessors of any block within the chain which have not
356 /// yet been scheduled. In general, we will delay scheduling this chain
357 /// until those predecessors are scheduled (or we find a sufficiently good
358 /// reason to override this heuristic.) Note that when forming loop chains,
359 /// blocks outside the loop are ignored and treated as if they were already
360 /// scheduled.
361 ///
362 /// Note: This field is reinitialized multiple times - once for each loop,
363 /// and then once for the function as a whole.
364 unsigned UnscheduledPredecessors = 0;
365};
366
367class MachineBlockPlacement {
368 /// A type for a block filter set.
369 using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
370
371 /// Pair struct containing basic block and taildup profitability
372 struct BlockAndTailDupResult {
373 MachineBasicBlock *BB = nullptr;
374 bool ShouldTailDup;
375 };
376
377 /// Triple struct containing edge weight and the edge.
378 struct WeightedEdge {
379 BlockFrequency Weight;
380 MachineBasicBlock *Src = nullptr;
381 MachineBasicBlock *Dest = nullptr;
382 };
383
384 /// work lists of blocks that are ready to be laid out
387
388 /// Edges that have already been computed as optimal.
389 DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
390
391 /// Machine Function
392 MachineFunction *F = nullptr;
393
394 /// A handle to the branch probability pass.
395 const MachineBranchProbabilityInfo *MBPI = nullptr;
396
397 /// A handle to the function-wide block frequency pass.
398 std::unique_ptr<MBFIWrapper> MBFI;
399
400 /// A handle to the loop info.
401 MachineLoopInfo *MLI = nullptr;
402
403 /// Preferred loop exit.
404 /// Member variable for convenience. It may be removed by duplication deep
405 /// in the call stack.
406 MachineBasicBlock *PreferredLoopExit = nullptr;
407
408 /// A handle to the target's instruction info.
409 const TargetInstrInfo *TII = nullptr;
410
411 /// A handle to the target's lowering info.
412 const TargetLoweringBase *TLI = nullptr;
413
414 /// A handle to the post dominator tree.
415 MachinePostDominatorTree *MPDT = nullptr;
416
417 ProfileSummaryInfo *PSI = nullptr;
418
419 // Tail merging is also determined based on
420 // whether structured CFG is required.
421 bool AllowTailMerge;
422
423 CodeGenOptLevel OptLevel;
424
425 /// Duplicator used to duplicate tails during placement.
426 ///
427 /// Placement decisions can open up new tail duplication opportunities, but
428 /// since tail duplication affects placement decisions of later blocks, it
429 /// must be done inline.
430 TailDuplicator TailDup;
431
432 /// Partial tail duplication threshold.
433 BlockFrequency DupThreshold;
434
435 unsigned TailDupSize;
436
437 /// True: use block profile count to compute tail duplication cost.
438 /// False: use block frequency to compute tail duplication cost.
439 bool UseProfileCount = false;
440
441 /// Allocator and owner of BlockChain structures.
442 ///
443 /// We build BlockChains lazily while processing the loop structure of
444 /// a function. To reduce malloc traffic, we allocate them using this
445 /// slab-like allocator, and destroy them after the pass completes. An
446 /// important guarantee is that this allocator produces stable pointers to
447 /// the chains.
448 SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
449
450 /// Function wide BasicBlock to BlockChain mapping.
451 ///
452 /// This mapping allows efficiently moving from any given basic block to the
453 /// BlockChain it participates in, if any. We use it to, among other things,
454 /// allow implicitly defining edges between chains as the existing edges
455 /// between basic blocks.
456 DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
457
458#ifndef NDEBUG
459 /// The set of basic blocks that have terminators that cannot be fully
460 /// analyzed. These basic blocks cannot be re-ordered safely by
461 /// MachineBlockPlacement, and we must preserve physical layout of these
462 /// blocks and their successors through the pass.
463 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
464#endif
465
466 /// Get block profile count or frequency according to UseProfileCount.
467 /// The return value is used to model tail duplication cost.
468 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469 if (UseProfileCount) {
470 auto Count = MBFI->getBlockProfileCount(BB);
471 if (Count)
472 return BlockFrequency(*Count);
473 else
474 return BlockFrequency(0);
475 } else
476 return MBFI->getBlockFreq(BB);
477 }
478
479 /// Scale the DupThreshold according to basic block size.
480 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481 void initTailDupThreshold();
482
483 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484 /// if the count goes to 0, add them to the appropriate work list.
485 void markChainSuccessors(const BlockChain &Chain,
486 const MachineBasicBlock *LoopHeaderBB,
487 const BlockFilterSet *BlockFilter = nullptr);
488
489 /// Decrease the UnscheduledPredecessors count for a single block, and
490 /// if the count goes to 0, add them to the appropriate work list.
491 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492 const MachineBasicBlock *LoopHeaderBB,
493 const BlockFilterSet *BlockFilter = nullptr);
494
495 BranchProbability
496 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
497 const BlockFilterSet *BlockFilter,
498 SmallVector<MachineBasicBlock *, 4> &Successors);
499 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
500 BlockFilterSet *BlockFilter);
501 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
502 MachineBasicBlock *BB,
503 BlockFilterSet *BlockFilter);
504 bool repeatedlyTailDuplicateBlock(
505 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
506 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507 BlockFilterSet *BlockFilter,
508 MachineFunction::iterator &PrevUnplacedBlockIt,
509 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510 bool
511 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
512 BlockChain &Chain, BlockFilterSet *BlockFilter,
513 MachineFunction::iterator &PrevUnplacedBlockIt,
514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515 bool &DuplicatedToLPred);
516 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517 const MachineBasicBlock *Succ,
518 const BlockChain &SuccChain,
519 BranchProbability SuccProb,
520 BranchProbability RealSuccProb,
521 const BlockChain &Chain,
522 const BlockFilterSet *BlockFilter);
523 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524 const BlockChain &Chain,
525 const BlockFilterSet *BlockFilter);
526 MachineBasicBlock *
527 selectBestCandidateBlock(const BlockChain &Chain,
528 SmallVectorImpl<MachineBasicBlock *> &WorkList);
529 MachineBasicBlock *
530 getFirstUnplacedBlock(const BlockChain &PlacedChain,
531 MachineFunction::iterator &PrevUnplacedBlockIt);
532 MachineBasicBlock *
533 getFirstUnplacedBlock(const BlockChain &PlacedChain,
534 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535 const BlockFilterSet *BlockFilter);
536
537 /// Add a basic block to the work list if it is appropriate.
538 ///
539 /// If the optional parameter BlockFilter is provided, only MBB
540 /// present in the set will be added to the worklist. If nullptr
541 /// is provided, no filtering occurs.
542 void fillWorkLists(const MachineBasicBlock *MBB,
543 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544 const BlockFilterSet *BlockFilter);
545
546 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547 BlockFilterSet *BlockFilter = nullptr);
548 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549 const MachineBasicBlock *OldTop);
550 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551 const BlockFilterSet &LoopBlockSet);
552 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553 const BlockFilterSet &LoopBlockSet);
554 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555 const MachineBasicBlock *OldTop,
556 const MachineBasicBlock *ExitBB,
557 const BlockFilterSet &LoopBlockSet);
558 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
559 const MachineLoop &L,
560 const BlockFilterSet &LoopBlockSet);
561 MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
562 const BlockFilterSet &LoopBlockSet);
563 MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
564 const BlockFilterSet &LoopBlockSet,
565 BlockFrequency &ExitFreq);
566 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567 void buildLoopChains(const MachineLoop &L);
568 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571 const BlockFilterSet &LoopBlockSet);
572 void buildCFGChains();
573 void optimizeBranches();
574 void alignBlocks();
575 /// Returns true if a block should be tail-duplicated to increase fallthrough
576 /// opportunities.
577 bool shouldTailDuplicate(MachineBasicBlock *BB);
578 /// Check the edge frequencies to see if tail duplication will increase
579 /// fallthroughs.
580 bool isProfitableToTailDup(const MachineBasicBlock *BB,
581 const MachineBasicBlock *Succ,
582 BranchProbability QProb, const BlockChain &Chain,
583 const BlockFilterSet *BlockFilter);
584
585 /// Check for a trellis layout.
586 bool isTrellis(const MachineBasicBlock *BB,
587 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590 /// Get the best successor given a trellis layout.
591 BlockAndTailDupResult getBestTrellisSuccessor(
592 const MachineBasicBlock *BB,
593 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594 BranchProbability AdjustedSumProb, const BlockChain &Chain,
595 const BlockFilterSet *BlockFilter);
596
597 /// Get the best pair of non-conflicting edges.
598 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599 const MachineBasicBlock *BB,
601
602 /// Returns true if a block can tail duplicate into all unplaced
603 /// predecessors. Filters based on loop.
604 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605 MachineBasicBlock *Succ,
606 const BlockChain &Chain,
607 const BlockFilterSet *BlockFilter);
608
609 /// Find chains of triangles to tail-duplicate where a global analysis works,
610 /// but a local analysis would not find them.
611 void precomputeTriangleChains();
612
613 /// Apply a post-processing step optimizing block placement.
614 void applyExtTsp(bool OptForSize);
615
616 /// Modify the existing block placement in the function and adjust all jumps.
617 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619 /// Create a single CFG chain from the current block order.
620 void createCFGChainExtTsp();
621
622public:
623 MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
624 MachineLoopInfo *MLI, ProfileSummaryInfo *PSI,
625 std::unique_ptr<MBFIWrapper> MBFI,
626 MachinePostDominatorTree *MPDT, bool AllowTailMerge)
627 : MBPI(MBPI), MBFI(std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628 AllowTailMerge(AllowTailMerge) {};
629
630 bool run(MachineFunction &F);
631
632 static bool allowTailDupPlacement(MachineFunction &MF) {
634 }
635};
636
637class MachineBlockPlacementLegacy : public MachineFunctionPass {
638public:
639 static char ID; // Pass identification, replacement for typeid
640
641 MachineBlockPlacementLegacy() : MachineFunctionPass(ID) {}
642
643 bool runOnMachineFunction(MachineFunction &MF) override {
644 if (skipFunction(MF.getFunction()))
645 return false;
646
647 auto *MBPI =
648 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
649 auto MBFI = std::make_unique<MBFIWrapper>(
650 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
651 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
652 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
653 ? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
654 .getPostDomTree()
655 : nullptr;
656 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
657 auto *PassConfig = &getAnalysis<TargetPassConfig>();
658 bool AllowTailMerge = PassConfig->getEnableTailMerge();
659 return MachineBlockPlacement(MBPI, MLI, PSI, std::move(MBFI), MPDT,
660 AllowTailMerge)
661 .run(MF);
662 }
663
664 void getAnalysisUsage(AnalysisUsage &AU) const override {
665 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
666 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
668 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
669 AU.addRequired<MachineLoopInfoWrapperPass>();
670 AU.addRequired<ProfileSummaryInfoWrapperPass>();
671 AU.addRequired<TargetPassConfig>();
673 }
674};
675
676} // end anonymous namespace
677
678char MachineBlockPlacementLegacy::ID = 0;
679
680char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
681
682INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
683 "Branch Probability Basic Block Placement", false, false)
689INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
690 "Branch Probability Basic Block Placement", false, false)
691
692#ifndef NDEBUG
693/// Helper to print the name of a MBB.
694///
695/// Only used by debug logging.
696static std::string getBlockName(const MachineBasicBlock *BB) {
697 std::string Result;
698 raw_string_ostream OS(Result);
699 OS << printMBBReference(*BB);
700 OS << " ('" << BB->getName() << "')";
701 return Result;
702}
703#endif
704
705/// Mark a chain's successors as having one fewer preds.
706///
707/// When a chain is being merged into the "placed" chain, this routine will
708/// quickly walk the successors of each block in the chain and mark them as
709/// having one fewer active predecessor. It also adds any successors of this
710/// chain which reach the zero-predecessor state to the appropriate worklist.
711void MachineBlockPlacement::markChainSuccessors(
712 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
713 const BlockFilterSet *BlockFilter) {
714 // Walk all the blocks in this chain, marking their successors as having
715 // a predecessor placed.
716 for (MachineBasicBlock *MBB : Chain) {
717 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
718 }
719}
720
721/// Mark a single block's successors as having one fewer preds.
722///
723/// Under normal circumstances, this is only called by markChainSuccessors,
724/// but if a block that was to be placed is completely tail-duplicated away,
725/// and was duplicated into the chain end, we need to redo markBlockSuccessors
726/// for just that block.
727void MachineBlockPlacement::markBlockSuccessors(
728 const BlockChain &Chain, const MachineBasicBlock *MBB,
729 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
730 // Add any successors for which this is the only un-placed in-loop
731 // predecessor to the worklist as a viable candidate for CFG-neutral
732 // placement. No subsequent placement of this block will violate the CFG
733 // shape, so we get to use heuristics to choose a favorable placement.
734 for (MachineBasicBlock *Succ : MBB->successors()) {
735 if (BlockFilter && !BlockFilter->count(Succ))
736 continue;
737 BlockChain &SuccChain = *BlockToChain[Succ];
738 // Disregard edges within a fixed chain, or edges to the loop header.
739 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
740 continue;
741
742 // This is a cross-chain edge that is within the loop, so decrement the
743 // loop predecessor count of the destination chain.
744 if (SuccChain.UnscheduledPredecessors == 0 ||
745 --SuccChain.UnscheduledPredecessors > 0)
746 continue;
747
748 auto *NewBB = *SuccChain.begin();
749 if (NewBB->isEHPad())
750 EHPadWorkList.push_back(NewBB);
751 else
752 BlockWorkList.push_back(NewBB);
753 }
754}
755
756/// This helper function collects the set of successors of block
757/// \p BB that are allowed to be its layout successors, and return
758/// the total branch probability of edges from \p BB to those
759/// blocks.
760BranchProbability MachineBlockPlacement::collectViableSuccessors(
761 const MachineBasicBlock *BB, const BlockChain &Chain,
762 const BlockFilterSet *BlockFilter,
763 SmallVector<MachineBasicBlock *, 4> &Successors) {
764 // Adjust edge probabilities by excluding edges pointing to blocks that is
765 // either not in BlockFilter or is already in the current chain. Consider the
766 // following CFG:
767 //
768 // --->A
769 // | / \
770 // | B C
771 // | \ / \
772 // ----D E
773 //
774 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
775 // A->C is chosen as a fall-through, D won't be selected as a successor of C
776 // due to CFG constraint (the probability of C->D is not greater than
777 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
778 // when calculating the probability of C->D, D will be selected and we
779 // will get A C D B as the layout of this loop.
780 auto AdjustedSumProb = BranchProbability::getOne();
781 for (MachineBasicBlock *Succ : BB->successors()) {
782 bool SkipSucc = false;
783 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
784 SkipSucc = true;
785 } else {
786 BlockChain *SuccChain = BlockToChain[Succ];
787 if (SuccChain == &Chain) {
788 SkipSucc = true;
789 } else if (Succ != *SuccChain->begin()) {
790 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
791 << " -> Mid chain!\n");
792 continue;
793 }
794 }
795 if (SkipSucc)
796 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
797 else
798 Successors.push_back(Succ);
799 }
800
801 return AdjustedSumProb;
802}
803
804/// The helper function returns the branch probability that is adjusted
805/// or normalized over the new total \p AdjustedSumProb.
806static BranchProbability
808 BranchProbability AdjustedSumProb) {
809 BranchProbability SuccProb;
810 uint32_t SuccProbN = OrigProb.getNumerator();
811 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
812 if (SuccProbN >= SuccProbD)
813 SuccProb = BranchProbability::getOne();
814 else
815 SuccProb = BranchProbability(SuccProbN, SuccProbD);
816
817 return SuccProb;
818}
819
820/// Check if \p BB has exactly the successors in \p Successors.
821static bool
824 if (BB.succ_size() != Successors.size())
825 return false;
826 // We don't want to count self-loops
827 if (Successors.count(&BB))
828 return false;
829 for (MachineBasicBlock *Succ : BB.successors())
830 if (!Successors.count(Succ))
831 return false;
832 return true;
833}
834
835/// Check if a block should be tail duplicated to increase fallthrough
836/// opportunities.
837/// \p BB Block to check.
838bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
839 // Blocks with single successors don't create additional fallthrough
840 // opportunities. Don't duplicate them. TODO: When conditional exits are
841 // analyzable, allow them to be duplicated.
842 bool IsSimple = TailDup.isSimpleBB(BB);
843
844 if (BB->succ_size() == 1)
845 return false;
846 return TailDup.shouldTailDuplicate(IsSimple, *BB);
847}
848
849/// Compare 2 BlockFrequency's with a small penalty for \p A.
850/// In order to be conservative, we apply a X% penalty to account for
851/// increased icache pressure and static heuristics. For small frequencies
852/// we use only the numerators to improve accuracy. For simplicity, we assume
853/// the penalty is less than 100%
854/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
856 BlockFrequency EntryFreq) {
857 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
858 BlockFrequency Gain = A - B;
859 return (Gain / ThresholdProb) >= EntryFreq;
860}
861
862/// Check the edge frequencies to see if tail duplication will increase
863/// fallthroughs. It only makes sense to call this function when
864/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
865/// always locally profitable if we would have picked \p Succ without
866/// considering duplication.
867bool MachineBlockPlacement::isProfitableToTailDup(
868 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
869 BranchProbability QProb, const BlockChain &Chain,
870 const BlockFilterSet *BlockFilter) {
871 // We need to do a probability calculation to make sure this is profitable.
872 // First: does succ have a successor that post-dominates? This affects the
873 // calculation. The 2 relevant cases are:
874 // BB BB
875 // | \Qout | \Qout
876 // P| C |P C
877 // = C' = C'
878 // | /Qin | /Qin
879 // | / | /
880 // Succ Succ
881 // / \ | \ V
882 // U/ =V |U \
883 // / \ = D
884 // D E | /
885 // | /
886 // |/
887 // PDom
888 // '=' : Branch taken for that CFG edge
889 // In the second case, Placing Succ while duplicating it into C prevents the
890 // fallthrough of Succ into either D or PDom, because they now have C as an
891 // unplaced predecessor
892
893 // Start by figuring out which case we fall into
894 MachineBasicBlock *PDom = nullptr;
895 SmallVector<MachineBasicBlock *, 4> SuccSuccs;
896 // Only scan the relevant successors
897 auto AdjustedSuccSumProb =
898 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
899 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
900 auto BBFreq = MBFI->getBlockFreq(BB);
901 auto SuccFreq = MBFI->getBlockFreq(Succ);
902 BlockFrequency P = BBFreq * PProb;
903 BlockFrequency Qout = BBFreq * QProb;
904 BlockFrequency EntryFreq = MBFI->getEntryFreq();
905 // If there are no more successors, it is profitable to copy, as it strictly
906 // increases fallthrough.
907 if (SuccSuccs.size() == 0)
908 return greaterWithBias(P, Qout, EntryFreq);
909
910 auto BestSuccSucc = BranchProbability::getZero();
911 // Find the PDom or the best Succ if no PDom exists.
912 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
913 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
914 if (Prob > BestSuccSucc)
915 BestSuccSucc = Prob;
916 if (PDom == nullptr)
917 if (MPDT->dominates(SuccSucc, Succ)) {
918 PDom = SuccSucc;
919 break;
920 }
921 }
922 // For the comparisons, we need to know Succ's best incoming edge that isn't
923 // from BB.
924 auto SuccBestPred = BlockFrequency(0);
925 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
926 if (SuccPred == Succ || SuccPred == BB ||
927 BlockToChain[SuccPred] == &Chain ||
928 (BlockFilter && !BlockFilter->count(SuccPred)))
929 continue;
930 auto Freq =
931 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
932 if (Freq > SuccBestPred)
933 SuccBestPred = Freq;
934 }
935 // Qin is Succ's best unplaced incoming edge that isn't BB
936 BlockFrequency Qin = SuccBestPred;
937 // If it doesn't have a post-dominating successor, here is the calculation:
938 // BB BB
939 // | \Qout | \
940 // P| C | =
941 // = C' | C
942 // | /Qin | |
943 // | / | C' (+Succ)
944 // Succ Succ /|
945 // / \ | \/ |
946 // U/ =V | == |
947 // / \ | / \|
948 // D E D E
949 // '=' : Branch taken for that CFG edge
950 // Cost in the first case is: P + V
951 // For this calculation, we always assume P > Qout. If Qout > P
952 // The result of this function will be ignored at the caller.
953 // Let F = SuccFreq - Qin
954 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
955
956 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
957 BranchProbability UProb = BestSuccSucc;
958 BranchProbability VProb = AdjustedSuccSumProb - UProb;
959 BlockFrequency F = SuccFreq - Qin;
960 BlockFrequency V = SuccFreq * VProb;
961 BlockFrequency QinU = std::min(Qin, F) * UProb;
962 BlockFrequency BaseCost = P + V;
963 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
964 return greaterWithBias(BaseCost, DupCost, EntryFreq);
965 }
966 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
967 BranchProbability VProb = AdjustedSuccSumProb - UProb;
968 BlockFrequency U = SuccFreq * UProb;
969 BlockFrequency V = SuccFreq * VProb;
970 BlockFrequency F = SuccFreq - Qin;
971 // If there is a post-dominating successor, here is the calculation:
972 // BB BB BB BB
973 // | \Qout | \ | \Qout | \
974 // |P C | = |P C | =
975 // = C' |P C = C' |P C
976 // | /Qin | | | /Qin | |
977 // | / | C' (+Succ) | / | C' (+Succ)
978 // Succ Succ /| Succ Succ /|
979 // | \ V | \/ | | \ V | \/ |
980 // |U \ |U /\ =? |U = |U /\ |
981 // = D = = =?| | D | = =|
982 // | / |/ D | / |/ D
983 // | / | / | = | /
984 // |/ | / |/ | =
985 // Dom Dom Dom Dom
986 // '=' : Branch taken for that CFG edge
987 // The cost for taken branches in the first case is P + U
988 // Let F = SuccFreq - Qin
989 // The cost in the second case (assuming independence), given the layout:
990 // BB, Succ, (C+Succ), D, Dom or the layout:
991 // BB, Succ, D, Dom, (C+Succ)
992 // is Qout + max(F, Qin) * U + min(F, Qin)
993 // compare P + U vs Qout + P * U + Qin.
994 //
995 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
996 //
997 // For the 3rd case, the cost is P + 2 * V
998 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
999 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
1000 if (UProb > AdjustedSuccSumProb / 2 &&
1001 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
1002 Chain, BlockFilter))
1003 // Cases 3 & 4
1004 return greaterWithBias(
1005 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
1006 EntryFreq);
1007 // Cases 1 & 2
1008 return greaterWithBias((P + U),
1009 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
1010 std::max(Qin, F) * UProb),
1011 EntryFreq);
1012}
1013
1014/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1015/// successors form the lower part of a trellis. A successor set S forms the
1016/// lower part of a trellis if all of the predecessors of S are either in S or
1017/// have all of S as successors. We ignore trellises where BB doesn't have 2
1018/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1019/// are very uncommon and complex to compute optimally. Allowing edges within S
1020/// is not strictly a trellis, but the same algorithm works, so we allow it.
1021bool MachineBlockPlacement::isTrellis(
1022 const MachineBasicBlock *BB,
1023 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1024 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1025 // Technically BB could form a trellis with branching factor higher than 2.
1026 // But that's extremely uncommon.
1027 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
1028 return false;
1029
1030 SmallPtrSet<const MachineBasicBlock *, 2> Successors(llvm::from_range,
1031 BB->successors());
1032 // To avoid reviewing the same predecessors twice.
1033 SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
1034
1035 for (MachineBasicBlock *Succ : ViableSuccs) {
1036 // Compile-time optimization: runtime is quadratic in the number of
1037 // predecessors. For such uncommon cases, exit early.
1038 if (Succ->pred_size() > PredecessorLimit)
1039 return false;
1040
1041 int PredCount = 0;
1042 for (auto *SuccPred : Succ->predecessors()) {
1043 // Allow triangle successors, but don't count them.
1044 if (Successors.count(SuccPred)) {
1045 // Make sure that it is actually a triangle.
1046 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1047 if (!Successors.count(CheckSucc))
1048 return false;
1049 continue;
1050 }
1051 const BlockChain *PredChain = BlockToChain[SuccPred];
1052 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1053 PredChain == &Chain || PredChain == BlockToChain[Succ])
1054 continue;
1055 ++PredCount;
1056 // Perform the successor check only once.
1057 if (!SeenPreds.insert(SuccPred).second)
1058 continue;
1059 if (!hasSameSuccessors(*SuccPred, Successors))
1060 return false;
1061 }
1062 // If one of the successors has only BB as a predecessor, it is not a
1063 // trellis.
1064 if (PredCount < 1)
1065 return false;
1066 }
1067 return true;
1068}
1069
1070/// Pick the highest total weight pair of edges that can both be laid out.
1071/// The edges in \p Edges[0] are assumed to have a different destination than
1072/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1073/// the individual highest weight edges to the 2 different destinations, or in
1074/// case of a conflict, one of them should be replaced with a 2nd best edge.
1075std::pair<MachineBlockPlacement::WeightedEdge,
1076 MachineBlockPlacement::WeightedEdge>
1077MachineBlockPlacement::getBestNonConflictingEdges(
1078 const MachineBasicBlock *BB,
1080 Edges) {
1081 // Sort the edges, and then for each successor, find the best incoming
1082 // predecessor. If the best incoming predecessors aren't the same,
1083 // then that is clearly the best layout. If there is a conflict, one of the
1084 // successors will have to fallthrough from the second best predecessor. We
1085 // compare which combination is better overall.
1086
1087 // Sort for highest frequency.
1088 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1089
1090 llvm::stable_sort(Edges[0], Cmp);
1091 llvm::stable_sort(Edges[1], Cmp);
1092 auto BestA = Edges[0].begin();
1093 auto BestB = Edges[1].begin();
1094 // Arrange for the correct answer to be in BestA and BestB
1095 // If the 2 best edges don't conflict, the answer is already there.
1096 if (BestA->Src == BestB->Src) {
1097 // Compare the total fallthrough of (Best + Second Best) for both pairs
1098 auto SecondBestA = std::next(BestA);
1099 auto SecondBestB = std::next(BestB);
1100 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1101 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1102 if (BestAScore < BestBScore)
1103 BestA = SecondBestA;
1104 else
1105 BestB = SecondBestB;
1106 }
1107 // Arrange for the BB edge to be in BestA if it exists.
1108 if (BestB->Src == BB)
1109 std::swap(BestA, BestB);
1110 return std::make_pair(*BestA, *BestB);
1111}
1112
1113/// Get the best successor from \p BB based on \p BB being part of a trellis.
1114/// We only handle trellises with 2 successors, so the algorithm is
1115/// straightforward: Find the best pair of edges that don't conflict. We find
1116/// the best incoming edge for each successor in the trellis. If those conflict,
1117/// we consider which of them should be replaced with the second best.
1118/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1119/// comes from \p BB, it will be in \p BestEdges[0]
1120MachineBlockPlacement::BlockAndTailDupResult
1121MachineBlockPlacement::getBestTrellisSuccessor(
1122 const MachineBasicBlock *BB,
1123 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1124 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1125 const BlockFilterSet *BlockFilter) {
1126
1127 BlockAndTailDupResult Result = {nullptr, false};
1128 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1129 BB->successors());
1130
1131 // We assume size 2 because it's common. For general n, we would have to do
1132 // the Hungarian algorithm, but it's not worth the complexity because more
1133 // than 2 successors is fairly uncommon, and a trellis even more so.
1134 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1135 return Result;
1136
1137 // Collect the edge frequencies of all edges that form the trellis.
1139 int SuccIndex = 0;
1140 for (auto *Succ : ViableSuccs) {
1141 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1142 // Skip any placed predecessors that are not BB
1143 if (SuccPred != BB) {
1144 if (BlockFilter && !BlockFilter->count(SuccPred))
1145 continue;
1146 const BlockChain *SuccPredChain = BlockToChain[SuccPred];
1147 if (SuccPredChain == &Chain || SuccPredChain == BlockToChain[Succ])
1148 continue;
1149 }
1150 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1151 MBPI->getEdgeProbability(SuccPred, Succ);
1152 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1153 }
1154 ++SuccIndex;
1155 }
1156
1157 // Pick the best combination of 2 edges from all the edges in the trellis.
1158 WeightedEdge BestA, BestB;
1159 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1160
1161 if (BestA.Src != BB) {
1162 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1163 // we shouldn't choose any successor. We've already looked and there's a
1164 // better fallthrough edge for all the successors.
1165 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1166 return Result;
1167 }
1168
1169 // Did we pick the triangle edge? If tail-duplication is profitable, do
1170 // that instead. Otherwise merge the triangle edge now while we know it is
1171 // optimal.
1172 if (BestA.Dest == BestB.Src) {
1173 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1174 // would be better.
1175 MachineBasicBlock *Succ1 = BestA.Dest;
1176 MachineBasicBlock *Succ2 = BestB.Dest;
1177 // Check to see if tail-duplication would be profitable.
1178 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ2) &&
1179 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1180 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1181 Chain, BlockFilter)) {
1182 LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
1183 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1184 dbgs() << " Selected: " << getBlockName(Succ2)
1185 << ", probability: " << Succ2Prob
1186 << " (Tail Duplicate)\n");
1187 Result.BB = Succ2;
1188 Result.ShouldTailDup = true;
1189 return Result;
1190 }
1191 }
1192 // We have already computed the optimal edge for the other side of the
1193 // trellis.
1194 ComputedEdges[BestB.Src] = {BestB.Dest, false};
1195
1196 auto TrellisSucc = BestA.Dest;
1197 LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
1198 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1199 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1200 << ", probability: " << SuccProb << " (Trellis)\n");
1201 Result.BB = TrellisSucc;
1202 return Result;
1203}
1204
1205/// When the option allowTailDupPlacement() is on, this method checks if the
1206/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1207/// into all of its unplaced, unfiltered predecessors, that are not BB.
1208bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1209 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1210 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1211 if (!shouldTailDuplicate(Succ))
1212 return false;
1213
1214 // The result of canTailDuplicate.
1215 bool Duplicate = true;
1216 // Number of possible duplication.
1217 unsigned int NumDup = 0;
1218
1219 // For CFG checking.
1220 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1221 BB->successors());
1222 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1223 // Make sure all unplaced and unfiltered predecessors can be
1224 // tail-duplicated into.
1225 // Skip any blocks that are already placed or not in this loop.
1226 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
1227 (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1228 continue;
1229 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1230 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1231 // This will result in a trellis after tail duplication, so we don't
1232 // need to copy Succ into this predecessor. In the presence
1233 // of a trellis tail duplication can continue to be profitable.
1234 // For example:
1235 // A A
1236 // |\ |\
1237 // | \ | \
1238 // | C | C+BB
1239 // | / | |
1240 // |/ | |
1241 // BB => BB |
1242 // |\ |\/|
1243 // | \ |/\|
1244 // | D | D
1245 // | / | /
1246 // |/ |/
1247 // Succ Succ
1248 //
1249 // After BB was duplicated into C, the layout looks like the one on the
1250 // right. BB and C now have the same successors. When considering
1251 // whether Succ can be duplicated into all its unplaced predecessors, we
1252 // ignore C.
1253 // We can do this because C already has a profitable fallthrough, namely
1254 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1255 // duplication and for this test.
1256 //
1257 // This allows trellises to be laid out in 2 separate chains
1258 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1259 // because it allows the creation of 2 fallthrough paths with links
1260 // between them, and we correctly identify the best layout for these
1261 // CFGs. We want to extend trellises that the user created in addition
1262 // to trellises created by tail-duplication, so we just look for the
1263 // CFG.
1264 continue;
1265 Duplicate = false;
1266 continue;
1267 }
1268 NumDup++;
1269 }
1270
1271 // No possible duplication in current filter set.
1272 if (NumDup == 0)
1273 return false;
1274
1275 // If profile information is available, findDuplicateCandidates can do more
1276 // precise benefit analysis.
1277 if (F->getFunction().hasProfileData())
1278 return true;
1279
1280 // This is mainly for function exit BB.
1281 // The integrated tail duplication is really designed for increasing
1282 // fallthrough from predecessors from Succ to its successors. We may need
1283 // other machanism to handle different cases.
1284 if (Succ->succ_empty())
1285 return true;
1286
1287 // Plus the already placed predecessor.
1288 NumDup++;
1289
1290 // If the duplication candidate has more unplaced predecessors than
1291 // successors, the extra duplication can't bring more fallthrough.
1292 //
1293 // Pred1 Pred2 Pred3
1294 // \ | /
1295 // \ | /
1296 // \ | /
1297 // Dup
1298 // / \
1299 // / \
1300 // Succ1 Succ2
1301 //
1302 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1303 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1304 // but the duplication into Pred3 can't increase fallthrough.
1305 //
1306 // A small number of extra duplication may not hurt too much. We need a better
1307 // heuristic to handle it.
1308 if ((NumDup > Succ->succ_size()) || !Duplicate)
1309 return false;
1310
1311 return true;
1312}
1313
1314/// Find chains of triangles where we believe it would be profitable to
1315/// tail-duplicate them all, but a local analysis would not find them.
1316/// There are 3 ways this can be profitable:
1317/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1318/// longer chains)
1319/// 2) The chains are statically correlated. Branch probabilities have a very
1320/// U-shaped distribution.
1321/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1322/// If the branches in a chain are likely to be from the same side of the
1323/// distribution as their predecessor, but are independent at runtime, this
1324/// transformation is profitable. (Because the cost of being wrong is a small
1325/// fixed cost, unlike the standard triangle layout where the cost of being
1326/// wrong scales with the # of triangles.)
1327/// 3) The chains are dynamically correlated. If the probability that a previous
1328/// branch was taken positively influences whether the next branch will be
1329/// taken
1330/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1331void MachineBlockPlacement::precomputeTriangleChains() {
1332 struct TriangleChain {
1333 std::vector<MachineBasicBlock *> Edges;
1334
1335 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1336 : Edges({src, dst}) {}
1337
1338 void append(MachineBasicBlock *dst) {
1339 assert(getKey()->isSuccessor(dst) &&
1340 "Attempting to append a block that is not a successor.");
1341 Edges.push_back(dst);
1342 }
1343
1344 unsigned count() const { return Edges.size() - 1; }
1345
1346 MachineBasicBlock *getKey() const { return Edges.back(); }
1347 };
1348
1349 if (TriangleChainCount == 0)
1350 return;
1351
1352 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1353 // Map from last block to the chain that contains it. This allows us to extend
1354 // chains as we find new triangles.
1355 DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
1356 for (MachineBasicBlock &BB : *F) {
1357 // If BB doesn't have 2 successors, it doesn't start a triangle.
1358 if (BB.succ_size() != 2)
1359 continue;
1360 MachineBasicBlock *PDom = nullptr;
1361 for (MachineBasicBlock *Succ : BB.successors()) {
1362 if (!MPDT->dominates(Succ, &BB))
1363 continue;
1364 PDom = Succ;
1365 break;
1366 }
1367 // If BB doesn't have a post-dominating successor, it doesn't form a
1368 // triangle.
1369 if (PDom == nullptr)
1370 continue;
1371 // If PDom has a hint that it is low probability, skip this triangle.
1372 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1373 continue;
1374 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1375 // we're looking for.
1376 if (!shouldTailDuplicate(PDom))
1377 continue;
1378 bool CanTailDuplicate = true;
1379 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1380 // isn't the kind of triangle we're looking for.
1381 for (MachineBasicBlock *Pred : PDom->predecessors()) {
1382 if (Pred == &BB)
1383 continue;
1384 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1385 CanTailDuplicate = false;
1386 break;
1387 }
1388 }
1389 // If we can't tail-duplicate PDom to its predecessors, then skip this
1390 // triangle.
1391 if (!CanTailDuplicate)
1392 continue;
1393
1394 // Now we have an interesting triangle. Insert it if it's not part of an
1395 // existing chain.
1396 // Note: This cannot be replaced with a call insert() or emplace() because
1397 // the find key is BB, but the insert/emplace key is PDom.
1398 auto Found = TriangleChainMap.find(&BB);
1399 // If it is, remove the chain from the map, grow it, and put it back in the
1400 // map with the end as the new key.
1401 if (Found != TriangleChainMap.end()) {
1402 TriangleChain Chain = std::move(Found->second);
1403 TriangleChainMap.erase(Found);
1404 Chain.append(PDom);
1405 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1406 } else {
1407 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1408 assert(InsertResult.second && "Block seen twice.");
1409 (void)InsertResult;
1410 }
1411 }
1412
1413 // Iterating over a DenseMap is safe here, because the only thing in the body
1414 // of the loop is inserting into another DenseMap (ComputedEdges).
1415 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1416 for (auto &ChainPair : TriangleChainMap) {
1417 TriangleChain &Chain = ChainPair.second;
1418 // Benchmarking has shown that due to branch correlation duplicating 2 or
1419 // more triangles is profitable, despite the calculations assuming
1420 // independence.
1421 if (Chain.count() < TriangleChainCount)
1422 continue;
1423 MachineBasicBlock *dst = Chain.Edges.back();
1424 Chain.Edges.pop_back();
1425 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1426 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1427 << getBlockName(dst)
1428 << " as pre-computed based on triangles.\n");
1429
1430 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1431 assert(InsertResult.second && "Block seen twice.");
1432 (void)InsertResult;
1433
1434 dst = src;
1435 }
1436 }
1437}
1438
1439// When profile is not present, return the StaticLikelyProb.
1440// When profile is available, we need to handle the triangle-shape CFG.
1441static BranchProbability
1443 if (!BB->getParent()->getFunction().hasProfileData())
1445 if (BB->succ_size() == 2) {
1446 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1447 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1448 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1449 /* See case 1 below for the cost analysis. For BB->Succ to
1450 * be taken with smaller cost, the following needs to hold:
1451 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1452 * So the threshold T in the calculation below
1453 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1454 * So T / (1 - T) = 2, Yielding T = 2/3
1455 * Also adding user specified branch bias, we have
1456 * T = (2/3)*(ProfileLikelyProb/50)
1457 * = (2*ProfileLikelyProb)/150)
1458 */
1459 return BranchProbability(2 * ProfileLikelyProb, 150);
1460 }
1461 }
1463}
1464
1465/// Checks to see if the layout candidate block \p Succ has a better layout
1466/// predecessor than \c BB. If yes, returns true.
1467/// \p SuccProb: The probability adjusted for only remaining blocks.
1468/// Only used for logging
1469/// \p RealSuccProb: The un-adjusted probability.
1470/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1471/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1472/// considered
1473bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1474 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1475 const BlockChain &SuccChain, BranchProbability SuccProb,
1476 BranchProbability RealSuccProb, const BlockChain &Chain,
1477 const BlockFilterSet *BlockFilter) {
1478
1479 // There isn't a better layout when there are no unscheduled predecessors.
1480 if (SuccChain.UnscheduledPredecessors == 0)
1481 return false;
1482
1483 // Compile-time optimization: runtime is quadratic in the number of
1484 // predecessors. For such uncommon cases, exit early.
1485 if (Succ->pred_size() > PredecessorLimit)
1486 return false;
1487
1488 // There are two basic scenarios here:
1489 // -------------------------------------
1490 // Case 1: triangular shape CFG (if-then):
1491 // BB
1492 // | \
1493 // | \
1494 // | Pred
1495 // | /
1496 // Succ
1497 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1498 // set Succ as the layout successor of BB. Picking Succ as BB's
1499 // successor breaks the CFG constraints (FIXME: define these constraints).
1500 // With this layout, Pred BB
1501 // is forced to be outlined, so the overall cost will be cost of the
1502 // branch taken from BB to Pred, plus the cost of back taken branch
1503 // from Pred to Succ, as well as the additional cost associated
1504 // with the needed unconditional jump instruction from Pred To Succ.
1505
1506 // The cost of the topological order layout is the taken branch cost
1507 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1508 // must hold:
1509 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1510 // < freq(BB->Succ) * taken_branch_cost.
1511 // Ignoring unconditional jump cost, we get
1512 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1513 // prob(BB->Succ) > 2 * prob(BB->Pred)
1514 //
1515 // When real profile data is available, we can precisely compute the
1516 // probability threshold that is needed for edge BB->Succ to be considered.
1517 // Without profile data, the heuristic requires the branch bias to be
1518 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1519 // -----------------------------------------------------------------
1520 // Case 2: diamond like CFG (if-then-else):
1521 // S
1522 // / \
1523 // | \
1524 // BB Pred
1525 // \ /
1526 // Succ
1527 // ..
1528 //
1529 // The current block is BB and edge BB->Succ is now being evaluated.
1530 // Note that edge S->BB was previously already selected because
1531 // prob(S->BB) > prob(S->Pred).
1532 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1533 // choose Pred, we will have a topological ordering as shown on the left
1534 // in the picture below. If we choose Succ, we have the solution as shown
1535 // on the right:
1536 //
1537 // topo-order:
1538 //
1539 // S----- ---S
1540 // | | | |
1541 // ---BB | | BB
1542 // | | | |
1543 // | Pred-- | Succ--
1544 // | | | |
1545 // ---Succ ---Pred--
1546 //
1547 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1548 // = freq(S->Pred) + freq(S->BB)
1549 //
1550 // If we have profile data (i.e, branch probabilities can be trusted), the
1551 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1552 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1553 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1554 // means the cost of topological order is greater.
1555 // When profile data is not available, however, we need to be more
1556 // conservative. If the branch prediction is wrong, breaking the topo-order
1557 // will actually yield a layout with large cost. For this reason, we need
1558 // strong biased branch at block S with Prob(S->BB) in order to select
1559 // BB->Succ. This is equivalent to looking the CFG backward with backward
1560 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1561 // profile data).
1562 // --------------------------------------------------------------------------
1563 // Case 3: forked diamond
1564 // S
1565 // / \
1566 // / \
1567 // BB Pred
1568 // | \ / |
1569 // | \ / |
1570 // | X |
1571 // | / \ |
1572 // | / \ |
1573 // S1 S2
1574 //
1575 // The current block is BB and edge BB->S1 is now being evaluated.
1576 // As above S->BB was already selected because
1577 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1578 //
1579 // topo-order:
1580 //
1581 // S-------| ---S
1582 // | | | |
1583 // ---BB | | BB
1584 // | | | |
1585 // | Pred----| | S1----
1586 // | | | |
1587 // --(S1 or S2) ---Pred--
1588 // |
1589 // S2
1590 //
1591 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1592 // + min(freq(Pred->S1), freq(Pred->S2))
1593 // Non-topo-order cost:
1594 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1595 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1596 // is 0. Then the non topo layout is better when
1597 // freq(S->Pred) < freq(BB->S1).
1598 // This is exactly what is checked below.
1599 // Note there are other shapes that apply (Pred may not be a single block,
1600 // but they all fit this general pattern.)
1601 BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
1602
1603 // Make sure that a hot successor doesn't have a globally more
1604 // important predecessor.
1605 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1606 bool BadCFGConflict = false;
1607
1608 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1609 BlockChain *PredChain = BlockToChain[Pred];
1610 if (Pred == Succ || PredChain == &SuccChain ||
1611 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
1612 Pred != *std::prev(PredChain->end()) ||
1613 // This check is redundant except for look ahead. This function is
1614 // called for lookahead by isProfitableToTailDup when BB hasn't been
1615 // placed yet.
1616 (Pred == BB))
1617 continue;
1618 // Do backward checking.
1619 // For all cases above, we need a backward checking to filter out edges that
1620 // are not 'strongly' biased.
1621 // BB Pred
1622 // \ /
1623 // Succ
1624 // We select edge BB->Succ if
1625 // freq(BB->Succ) > freq(Succ) * HotProb
1626 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1627 // HotProb
1628 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1629 // Case 1 is covered too, because the first equation reduces to:
1630 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1631 BlockFrequency PredEdgeFreq =
1632 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1633 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1634 BadCFGConflict = true;
1635 break;
1636 }
1637 }
1638
1639 if (BadCFGConflict) {
1640 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1641 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1642 return true;
1643 }
1644
1645 return false;
1646}
1647
1648/// Select the best successor for a block.
1649///
1650/// This looks across all successors of a particular block and attempts to
1651/// select the "best" one to be the layout successor. It only considers direct
1652/// successors which also pass the block filter. It will attempt to avoid
1653/// breaking CFG structure, but cave and break such structures in the case of
1654/// very hot successor edges.
1655///
1656/// \returns The best successor block found, or null if none are viable, along
1657/// with a boolean indicating if tail duplication is necessary.
1658MachineBlockPlacement::BlockAndTailDupResult
1659MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1660 const BlockChain &Chain,
1661 const BlockFilterSet *BlockFilter) {
1662 const BranchProbability HotProb(StaticLikelyProb, 100);
1663
1664 BlockAndTailDupResult BestSucc = {nullptr, false};
1665 auto BestProb = BranchProbability::getZero();
1666
1667 SmallVector<MachineBasicBlock *, 4> Successors;
1668 auto AdjustedSumProb =
1669 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1670
1671 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1672 << "\n");
1673
1674 // if we already precomputed the best successor for BB, return that if still
1675 // applicable.
1676 auto FoundEdge = ComputedEdges.find(BB);
1677 if (FoundEdge != ComputedEdges.end()) {
1678 MachineBasicBlock *Succ = FoundEdge->second.BB;
1679 ComputedEdges.erase(FoundEdge);
1680 BlockChain *SuccChain = BlockToChain[Succ];
1681 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1682 SuccChain != &Chain && Succ == *SuccChain->begin())
1683 return FoundEdge->second;
1684 }
1685
1686 // if BB is part of a trellis, Use the trellis to determine the optimal
1687 // fallthrough edges
1688 if (isTrellis(BB, Successors, Chain, BlockFilter))
1689 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1690 BlockFilter);
1691
1692 // For blocks with CFG violations, we may be able to lay them out anyway with
1693 // tail-duplication. We keep this vector so we can perform the probability
1694 // calculations the minimum number of times.
1696 DupCandidates;
1697 for (MachineBasicBlock *Succ : Successors) {
1698 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1699 BranchProbability SuccProb =
1700 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1701
1702 BlockChain &SuccChain = *BlockToChain[Succ];
1703 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1704 // predecessor that yields lower global cost.
1705 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1706 Chain, BlockFilter)) {
1707 // If tail duplication would make Succ profitable, place it.
1708 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ))
1709 DupCandidates.emplace_back(SuccProb, Succ);
1710 continue;
1711 }
1712
1713 LLVM_DEBUG(
1714 dbgs() << " Candidate: " << getBlockName(Succ)
1715 << ", probability: " << SuccProb
1716 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1717 << "\n");
1718
1719 if (BestSucc.BB && BestProb >= SuccProb) {
1720 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1721 continue;
1722 }
1723
1724 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1725 BestSucc.BB = Succ;
1726 BestProb = SuccProb;
1727 }
1728 // Handle the tail duplication candidates in order of decreasing probability.
1729 // Stop at the first one that is profitable. Also stop if they are less
1730 // profitable than BestSucc. Position is important because we preserve it and
1731 // prefer first best match. Here we aren't comparing in order, so we capture
1732 // the position instead.
1733 llvm::stable_sort(DupCandidates,
1734 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1735 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1736 return std::get<0>(L) > std::get<0>(R);
1737 });
1738 for (auto &Tup : DupCandidates) {
1739 BranchProbability DupProb;
1740 MachineBasicBlock *Succ;
1741 std::tie(DupProb, Succ) = Tup;
1742 if (DupProb < BestProb)
1743 break;
1744 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1745 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1746 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1747 << ", probability: " << DupProb
1748 << " (Tail Duplicate)\n");
1749 BestSucc.BB = Succ;
1750 BestSucc.ShouldTailDup = true;
1751 break;
1752 }
1753 }
1754
1755 if (BestSucc.BB)
1756 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1757
1758 return BestSucc;
1759}
1760
1761/// Select the best block from a worklist.
1762///
1763/// This looks through the provided worklist as a list of candidate basic
1764/// blocks and select the most profitable one to place. The definition of
1765/// profitable only really makes sense in the context of a loop. This returns
1766/// the most frequently visited block in the worklist, which in the case of
1767/// a loop, is the one most desirable to be physically close to the rest of the
1768/// loop body in order to improve i-cache behavior.
1769///
1770/// \returns The best block found, or null if none are viable.
1771MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1772 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1773 // Once we need to walk the worklist looking for a candidate, cleanup the
1774 // worklist of already placed entries.
1775 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1776 // some code complexity) into the loop below.
1777 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1778 return BlockToChain.lookup(BB) == &Chain;
1779 });
1780
1781 if (WorkList.empty())
1782 return nullptr;
1783
1784 bool IsEHPad = WorkList[0]->isEHPad();
1785
1786 MachineBasicBlock *BestBlock = nullptr;
1787 BlockFrequency BestFreq;
1788 for (MachineBasicBlock *MBB : WorkList) {
1789 assert(MBB->isEHPad() == IsEHPad &&
1790 "EHPad mismatch between block and work list.");
1791
1792 BlockChain &SuccChain = *BlockToChain[MBB];
1793 if (&SuccChain == &Chain)
1794 continue;
1795
1796 assert(SuccChain.UnscheduledPredecessors == 0 &&
1797 "Found CFG-violating block");
1798
1799 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1800 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1801 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1802 << " (freq)\n");
1803
1804 // For ehpad, we layout the least probable first as to avoid jumping back
1805 // from least probable landingpads to more probable ones.
1806 //
1807 // FIXME: Using probability is probably (!) not the best way to achieve
1808 // this. We should probably have a more principled approach to layout
1809 // cleanup code.
1810 //
1811 // The goal is to get:
1812 //
1813 // +--------------------------+
1814 // | V
1815 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1816 //
1817 // Rather than:
1818 //
1819 // +-------------------------------------+
1820 // V |
1821 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1822 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1823 continue;
1824
1825 BestBlock = MBB;
1826 BestFreq = CandidateFreq;
1827 }
1828
1829 return BestBlock;
1830}
1831
1832/// Retrieve the first unplaced basic block in the entire function.
1833///
1834/// This routine is called when we are unable to use the CFG to walk through
1835/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1836/// We walk through the function's blocks in order, starting from the
1837/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1838/// re-scanning the entire sequence on repeated calls to this routine.
1839MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1840 const BlockChain &PlacedChain,
1841 MachineFunction::iterator &PrevUnplacedBlockIt) {
1842
1843 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1844 ++I) {
1845 if (BlockChain *Chain = BlockToChain[&*I]; Chain != &PlacedChain) {
1846 PrevUnplacedBlockIt = I;
1847 // Now select the head of the chain to which the unplaced block belongs
1848 // as the block to place. This will force the entire chain to be placed,
1849 // and satisfies the requirements of merging chains.
1850 return *Chain->begin();
1851 }
1852 }
1853 return nullptr;
1854}
1855
1856/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1857///
1858/// This is similar to getFirstUnplacedBlock for the entire function, but since
1859/// the size of BlockFilter is typically far less than the number of blocks in
1860/// the entire function, iterating through the BlockFilter is more efficient.
1861/// When processing the entire funciton, using the version without BlockFilter
1862/// has a complexity of #(loops in function) * #(blocks in function), while this
1863/// version has a complexity of sum(#(loops in block) foreach block in function)
1864/// which is always smaller. For long function mostly sequential in structure,
1865/// the complexity is amortized to 1 * #(blocks in function).
1866MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1867 const BlockChain &PlacedChain,
1868 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1869 const BlockFilterSet *BlockFilter) {
1870 assert(BlockFilter);
1871 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1872 ++PrevUnplacedBlockInFilterIt) {
1873 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1874 if (C != &PlacedChain) {
1875 return *C->begin();
1876 }
1877 }
1878 return nullptr;
1879}
1880
1881void MachineBlockPlacement::fillWorkLists(
1882 const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1883 const BlockFilterSet *BlockFilter = nullptr) {
1884 BlockChain &Chain = *BlockToChain[MBB];
1885 if (!UpdatedPreds.insert(&Chain).second)
1886 return;
1887
1888 assert(
1889 Chain.UnscheduledPredecessors == 0 &&
1890 "Attempting to place block with unscheduled predecessors in worklist.");
1891 for (MachineBasicBlock *ChainBB : Chain) {
1892 assert(BlockToChain[ChainBB] == &Chain &&
1893 "Block in chain doesn't match BlockToChain map.");
1894 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1895 if (BlockFilter && !BlockFilter->count(Pred))
1896 continue;
1897 if (BlockToChain[Pred] == &Chain)
1898 continue;
1899 ++Chain.UnscheduledPredecessors;
1900 }
1901 }
1902
1903 if (Chain.UnscheduledPredecessors != 0)
1904 return;
1905
1906 MachineBasicBlock *BB = *Chain.begin();
1907 if (BB->isEHPad())
1908 EHPadWorkList.push_back(BB);
1909 else
1910 BlockWorkList.push_back(BB);
1911}
1912
1913void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1914 BlockChain &Chain,
1915 BlockFilterSet *BlockFilter) {
1916 assert(HeadBB && "BB must not be null.\n");
1917 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1918 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1919 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1920 if (BlockFilter)
1921 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1922
1923 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1924 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1925 MachineBasicBlock *BB = *std::prev(Chain.end());
1926 while (true) {
1927 assert(BB && "null block found at end of chain in loop.");
1928 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1929 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1930
1931 // Look for the best viable successor if there is one to place immediately
1932 // after this block.
1933 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1934 MachineBasicBlock *BestSucc = Result.BB;
1935 bool ShouldTailDup = Result.ShouldTailDup;
1936 if (allowTailDupPlacement(*F))
1937 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(
1938 BB, BestSucc, Chain, BlockFilter));
1939
1940 // If an immediate successor isn't available, look for the best viable
1941 // block among those we've identified as not violating the loop's CFG at
1942 // this point. This won't be a fallthrough, but it will increase locality.
1943 if (!BestSucc)
1944 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1945 if (!BestSucc)
1946 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1947
1948 if (!BestSucc) {
1949 if (BlockFilter)
1950 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1951 BlockFilter);
1952 else
1953 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1954 if (!BestSucc)
1955 break;
1956
1957 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1958 "layout successor until the CFG reduces\n");
1959 }
1960
1961 // Placement may have changed tail duplication opportunities.
1962 // Check for that now.
1963 if (allowTailDupPlacement(*F) && BestSucc && ShouldTailDup) {
1964 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1965 BlockFilter, PrevUnplacedBlockIt,
1966 PrevUnplacedBlockInFilterIt);
1967 // If the chosen successor was duplicated into BB, don't bother laying
1968 // it out, just go round the loop again with BB as the chain end.
1969 if (!BB->isSuccessor(BestSucc))
1970 continue;
1971 }
1972
1973 // Place this block, updating the datastructures to reflect its placement.
1974 BlockChain &SuccChain = *BlockToChain[BestSucc];
1975 // Zero out UnscheduledPredecessors for the successor we're about to merge
1976 // in case we selected a successor that didn't fit naturally into the CFG.
1977 SuccChain.UnscheduledPredecessors = 0;
1978 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1979 << getBlockName(BestSucc) << "\n");
1980 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1981 Chain.merge(BestSucc, &SuccChain);
1982 BB = *std::prev(Chain.end());
1983 }
1984
1985 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1986 << getBlockName(*Chain.begin()) << "\n");
1987}
1988
1989// If bottom of block BB has only one successor OldTop, in most cases it is
1990// profitable to move it before OldTop, except the following case:
1991//
1992// -->OldTop<-
1993// | . |
1994// | . |
1995// | . |
1996// ---Pred |
1997// | |
1998// BB-----
1999//
2000// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2001// layout the other successor below it, so it can't reduce taken branch.
2002// In this case we keep its original layout.
2003bool MachineBlockPlacement::canMoveBottomBlockToTop(
2004 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) {
2005 if (BottomBlock->pred_size() != 1)
2006 return true;
2007 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
2008 if (Pred->succ_size() != 2)
2009 return true;
2010
2011 MachineBasicBlock *OtherBB = *Pred->succ_begin();
2012 if (OtherBB == BottomBlock)
2013 OtherBB = *Pred->succ_rbegin();
2014 if (OtherBB == OldTop)
2015 return false;
2016
2017 return true;
2018}
2019
2020// Find out the possible fall through frequence to the top of a loop.
2021BlockFrequency
2022MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2023 const BlockFilterSet &LoopBlockSet) {
2024 BlockFrequency MaxFreq = BlockFrequency(0);
2025 for (MachineBasicBlock *Pred : Top->predecessors()) {
2026 BlockChain *PredChain = BlockToChain[Pred];
2027 if (!LoopBlockSet.count(Pred) &&
2028 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2029 // Found a Pred block can be placed before Top.
2030 // Check if Top is the best successor of Pred.
2031 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2032 bool TopOK = true;
2033 for (MachineBasicBlock *Succ : Pred->successors()) {
2034 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2035 BlockChain *SuccChain = BlockToChain[Succ];
2036 // Check if Succ can be placed after Pred.
2037 // Succ should not be in any chain, or it is the head of some chain.
2038 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
2039 (!SuccChain || Succ == *SuccChain->begin())) {
2040 TopOK = false;
2041 break;
2042 }
2043 }
2044 if (TopOK) {
2045 BlockFrequency EdgeFreq =
2046 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top);
2047 if (EdgeFreq > MaxFreq)
2048 MaxFreq = EdgeFreq;
2049 }
2050 }
2051 }
2052 return MaxFreq;
2053}
2054
2055// Compute the fall through gains when move NewTop before OldTop.
2056//
2057// In following diagram, edges marked as "-" are reduced fallthrough, edges
2058// marked as "+" are increased fallthrough, this function computes
2059//
2060// SUM(increased fallthrough) - SUM(decreased fallthrough)
2061//
2062// |
2063// | -
2064// V
2065// --->OldTop
2066// | .
2067// | .
2068// +| . +
2069// | Pred --->
2070// | |-
2071// | V
2072// --- NewTop <---
2073// |-
2074// V
2075//
2076BlockFrequency MachineBlockPlacement::FallThroughGains(
2077 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop,
2078 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) {
2079 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2080 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2081 if (ExitBB)
2082 FallThrough2Exit =
2083 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB);
2084 BlockFrequency BackEdgeFreq =
2085 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop);
2086
2087 // Find the best Pred of NewTop.
2088 MachineBasicBlock *BestPred = nullptr;
2089 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2090 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2091 if (!LoopBlockSet.count(Pred))
2092 continue;
2093 BlockChain *PredChain = BlockToChain[Pred];
2094 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2095 BlockFrequency EdgeFreq =
2096 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2097 if (EdgeFreq > FallThroughFromPred) {
2098 FallThroughFromPred = EdgeFreq;
2099 BestPred = Pred;
2100 }
2101 }
2102 }
2103
2104 // If NewTop is not placed after Pred, another successor can be placed
2105 // after Pred.
2106 BlockFrequency NewFreq = BlockFrequency(0);
2107 if (BestPred) {
2108 for (MachineBasicBlock *Succ : BestPred->successors()) {
2109 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2110 continue;
2111 if (ComputedEdges.contains(Succ))
2112 continue;
2113 BlockChain *SuccChain = BlockToChain[Succ];
2114 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2115 (SuccChain == BlockToChain[BestPred]))
2116 continue;
2117 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2118 MBPI->getEdgeProbability(BestPred, Succ);
2119 if (EdgeFreq > NewFreq)
2120 NewFreq = EdgeFreq;
2121 }
2122 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2123 MBPI->getEdgeProbability(BestPred, NewTop);
2124 if (NewFreq > OrigEdgeFreq) {
2125 // If NewTop is not the best successor of Pred, then Pred doesn't
2126 // fallthrough to NewTop. So there is no FallThroughFromPred and
2127 // NewFreq.
2128 NewFreq = BlockFrequency(0);
2129 FallThroughFromPred = BlockFrequency(0);
2130 }
2131 }
2132
2133 BlockFrequency Result = BlockFrequency(0);
2134 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2135 BlockFrequency Lost =
2136 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2137 if (Gains > Lost)
2138 Result = Gains - Lost;
2139 return Result;
2140}
2141
2142/// Helper function of findBestLoopTop. Find the best loop top block
2143/// from predecessors of old top.
2144///
2145/// Look for a block which is strictly better than the old top for laying
2146/// out before the old top of the loop. This looks for only two patterns:
2147///
2148/// 1. a block has only one successor, the old loop top
2149///
2150/// Because such a block will always result in an unconditional jump,
2151/// rotating it in front of the old top is always profitable.
2152///
2153/// 2. a block has two successors, one is old top, another is exit
2154/// and it has more than one predecessors
2155///
2156/// If it is below one of its predecessors P, only P can fall through to
2157/// it, all other predecessors need a jump to it, and another conditional
2158/// jump to loop header. If it is moved before loop header, all its
2159/// predecessors jump to it, then fall through to loop header. So all its
2160/// predecessors except P can reduce one taken branch.
2161/// At the same time, move it before old top increases the taken branch
2162/// to loop exit block, so the reduced taken branch will be compared with
2163/// the increased taken branch to the loop exit block.
2164MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2165 MachineBasicBlock *OldTop, const MachineLoop &L,
2166 const BlockFilterSet &LoopBlockSet) {
2167 // Check that the header hasn't been fused with a preheader block due to
2168 // crazy branches. If it has, we need to start with the header at the top to
2169 // prevent pulling the preheader into the loop body.
2170 BlockChain &HeaderChain = *BlockToChain[OldTop];
2171 if (!LoopBlockSet.count(*HeaderChain.begin()))
2172 return OldTop;
2173 if (OldTop != *HeaderChain.begin())
2174 return OldTop;
2175
2176 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2177 << "\n");
2178
2179 BlockFrequency BestGains = BlockFrequency(0);
2180 MachineBasicBlock *BestPred = nullptr;
2181 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2182 if (!LoopBlockSet.count(Pred))
2183 continue;
2184 if (Pred == L.getHeader())
2185 continue;
2186 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2187 << Pred->succ_size() << " successors, "
2188 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2189 if (Pred->succ_size() > 2)
2190 continue;
2191
2192 MachineBasicBlock *OtherBB = nullptr;
2193 if (Pred->succ_size() == 2) {
2194 OtherBB = *Pred->succ_begin();
2195 if (OtherBB == OldTop)
2196 OtherBB = *Pred->succ_rbegin();
2197 }
2198
2199 if (!canMoveBottomBlockToTop(Pred, OldTop))
2200 continue;
2201
2202 BlockFrequency Gains =
2203 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet);
2204 if ((Gains > BlockFrequency(0)) &&
2205 (Gains > BestGains ||
2206 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2207 BestPred = Pred;
2208 BestGains = Gains;
2209 }
2210 }
2211
2212 // If no direct predecessor is fine, just use the loop header.
2213 if (!BestPred) {
2214 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2215 return OldTop;
2216 }
2217
2218 // Walk backwards through any straight line of predecessors.
2219 while (BestPred->pred_size() == 1 &&
2220 (*BestPred->pred_begin())->succ_size() == 1 &&
2221 *BestPred->pred_begin() != L.getHeader())
2222 BestPred = *BestPred->pred_begin();
2223
2224 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2225 return BestPred;
2226}
2227
2228/// Find the best loop top block for layout.
2229///
2230/// This function iteratively calls findBestLoopTopHelper, until no new better
2231/// BB can be found.
2232MachineBasicBlock *
2233MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2234 const BlockFilterSet &LoopBlockSet) {
2235 // Placing the latch block before the header may introduce an extra branch
2236 // that skips this block the first time the loop is executed, which we want
2237 // to avoid when optimising for size.
2238 // FIXME: in theory there is a case that does not introduce a new branch,
2239 // i.e. when the layout predecessor does not fallthrough to the loop header.
2240 // In practice this never happens though: there always seems to be a preheader
2241 // that can fallthrough and that is also placed before the header.
2242 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()))
2243 return L.getHeader();
2244
2245 MachineBasicBlock *OldTop = nullptr;
2246 MachineBasicBlock *NewTop = L.getHeader();
2247 while (NewTop != OldTop) {
2248 OldTop = NewTop;
2249 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2250 if (NewTop != OldTop)
2251 ComputedEdges[NewTop] = {OldTop, false};
2252 }
2253 return NewTop;
2254}
2255
2256/// Find the best loop exiting block for layout.
2257///
2258/// This routine implements the logic to analyze the loop looking for the best
2259/// block to layout at the top of the loop. Typically this is done to maximize
2260/// fallthrough opportunities.
2261MachineBasicBlock *
2262MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2263 const BlockFilterSet &LoopBlockSet,
2264 BlockFrequency &ExitFreq) {
2265 // We don't want to layout the loop linearly in all cases. If the loop header
2266 // is just a normal basic block in the loop, we want to look for what block
2267 // within the loop is the best one to layout at the top. However, if the loop
2268 // header has be pre-merged into a chain due to predecessors not having
2269 // analyzable branches, *and* the predecessor it is merged with is *not* part
2270 // of the loop, rotating the header into the middle of the loop will create
2271 // a non-contiguous range of blocks which is Very Bad. So start with the
2272 // header and only rotate if safe.
2273 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2274 if (!LoopBlockSet.count(*HeaderChain.begin()))
2275 return nullptr;
2276
2277 BlockFrequency BestExitEdgeFreq;
2278 unsigned BestExitLoopDepth = 0;
2279 MachineBasicBlock *ExitingBB = nullptr;
2280 // If there are exits to outer loops, loop rotation can severely limit
2281 // fallthrough opportunities unless it selects such an exit. Keep a set of
2282 // blocks where rotating to exit with that block will reach an outer loop.
2283 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2284
2285 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2286 << getBlockName(L.getHeader()) << "\n");
2287 for (MachineBasicBlock *MBB : L.getBlocks()) {
2288 BlockChain &Chain = *BlockToChain[MBB];
2289 // Ensure that this block is at the end of a chain; otherwise it could be
2290 // mid-way through an inner loop or a successor of an unanalyzable branch.
2291 if (MBB != *std::prev(Chain.end()))
2292 continue;
2293
2294 // Now walk the successors. We need to establish whether this has a viable
2295 // exiting successor and whether it has a viable non-exiting successor.
2296 // We store the old exiting state and restore it if a viable looping
2297 // successor isn't found.
2298 MachineBasicBlock *OldExitingBB = ExitingBB;
2299 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2300 bool HasLoopingSucc = false;
2301 for (MachineBasicBlock *Succ : MBB->successors()) {
2302 if (Succ->isEHPad())
2303 continue;
2304 if (Succ == MBB)
2305 continue;
2306 BlockChain &SuccChain = *BlockToChain[Succ];
2307 // Don't split chains, either this chain or the successor's chain.
2308 if (&Chain == &SuccChain) {
2309 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2310 << getBlockName(Succ) << " (chain conflict)\n");
2311 continue;
2312 }
2313
2314 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2315 if (LoopBlockSet.count(Succ)) {
2316 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2317 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2318 HasLoopingSucc = true;
2319 continue;
2320 }
2321
2322 unsigned SuccLoopDepth = 0;
2323 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2324 SuccLoopDepth = ExitLoop->getLoopDepth();
2325 if (ExitLoop->contains(&L))
2326 BlocksExitingToOuterLoop.insert(MBB);
2327 }
2328
2329 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2330 LLVM_DEBUG(
2331 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2332 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2333 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2334 // Note that we bias this toward an existing layout successor to retain
2335 // incoming order in the absence of better information. The exit must have
2336 // a frequency higher than the current exit before we consider breaking
2337 // the layout.
2338 BranchProbability Bias(100 - ExitBlockBias, 100);
2339 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2340 ExitEdgeFreq > BestExitEdgeFreq ||
2341 (MBB->isLayoutSuccessor(Succ) &&
2342 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2343 BestExitEdgeFreq = ExitEdgeFreq;
2344 ExitingBB = MBB;
2345 }
2346 }
2347
2348 if (!HasLoopingSucc) {
2349 // Restore the old exiting state, no viable looping successor was found.
2350 ExitingBB = OldExitingBB;
2351 BestExitEdgeFreq = OldBestExitEdgeFreq;
2352 }
2353 }
2354 // Without a candidate exiting block or with only a single block in the
2355 // loop, just use the loop header to layout the loop.
2356 if (!ExitingBB) {
2357 LLVM_DEBUG(
2358 dbgs() << " No other candidate exit blocks, using loop header\n");
2359 return nullptr;
2360 }
2361 if (L.getNumBlocks() == 1) {
2362 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2363 return nullptr;
2364 }
2365
2366 // Also, if we have exit blocks which lead to outer loops but didn't select
2367 // one of them as the exiting block we are rotating toward, disable loop
2368 // rotation altogether.
2369 if (!BlocksExitingToOuterLoop.empty() &&
2370 !BlocksExitingToOuterLoop.count(ExitingBB))
2371 return nullptr;
2372
2373 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2374 << "\n");
2375 ExitFreq = BestExitEdgeFreq;
2376 return ExitingBB;
2377}
2378
2379/// Check if there is a fallthrough to loop header Top.
2380///
2381/// 1. Look for a Pred that can be layout before Top.
2382/// 2. Check if Top is the most possible successor of Pred.
2383bool MachineBlockPlacement::hasViableTopFallthrough(
2384 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) {
2385 for (MachineBasicBlock *Pred : Top->predecessors()) {
2386 BlockChain *PredChain = BlockToChain[Pred];
2387 if (!LoopBlockSet.count(Pred) &&
2388 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2389 // Found a Pred block can be placed before Top.
2390 // Check if Top is the best successor of Pred.
2391 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2392 bool TopOK = true;
2393 for (MachineBasicBlock *Succ : Pred->successors()) {
2394 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2395 BlockChain *SuccChain = BlockToChain[Succ];
2396 // Check if Succ can be placed after Pred.
2397 // Succ should not be in any chain, or it is the head of some chain.
2398 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2399 TopOK = false;
2400 break;
2401 }
2402 }
2403 if (TopOK)
2404 return true;
2405 }
2406 }
2407 return false;
2408}
2409
2410/// Attempt to rotate an exiting block to the bottom of the loop.
2411///
2412/// Once we have built a chain, try to rotate it to line up the hot exit block
2413/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2414/// branches. For example, if the loop has fallthrough into its header and out
2415/// of its bottom already, don't rotate it.
2416void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2417 const MachineBasicBlock *ExitingBB,
2418 BlockFrequency ExitFreq,
2419 const BlockFilterSet &LoopBlockSet) {
2420 if (!ExitingBB)
2421 return;
2422
2423 MachineBasicBlock *Top = *LoopChain.begin();
2424 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2425
2426 // If ExitingBB is already the last one in a chain then nothing to do.
2427 if (Bottom == ExitingBB)
2428 return;
2429
2430 // The entry block should always be the first BB in a function.
2431 if (Top->isEntryBlock())
2432 return;
2433
2434 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2435
2436 // If the header has viable fallthrough, check whether the current loop
2437 // bottom is a viable exiting block. If so, bail out as rotating will
2438 // introduce an unnecessary branch.
2439 if (ViableTopFallthrough) {
2440 for (MachineBasicBlock *Succ : Bottom->successors()) {
2441 BlockChain *SuccChain = BlockToChain[Succ];
2442 if (!LoopBlockSet.count(Succ) &&
2443 (!SuccChain || Succ == *SuccChain->begin()))
2444 return;
2445 }
2446
2447 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2448 // frequency is larger than top fallthrough.
2449 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2450 if (FallThrough2Top >= ExitFreq)
2451 return;
2452 }
2453
2454 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2455 if (ExitIt == LoopChain.end())
2456 return;
2457
2458 // Rotating a loop exit to the bottom when there is a fallthrough to top
2459 // trades the entry fallthrough for an exit fallthrough.
2460 // If there is no bottom->top edge, but the chosen exit block does have
2461 // a fallthrough, we break that fallthrough for nothing in return.
2462
2463 // Let's consider an example. We have a built chain of basic blocks
2464 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2465 // By doing a rotation we get
2466 // Bk+1, ..., Bn, B1, ..., Bk
2467 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2468 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2469 // It might be compensated by fallthrough Bn -> B1.
2470 // So we have a condition to avoid creation of extra branch by loop rotation.
2471 // All below must be true to avoid loop rotation:
2472 // If there is a fallthrough to top (B1)
2473 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2474 // There is no fallthrough from bottom (Bn) to top (B1).
2475 // Please note that there is no exit fallthrough from Bn because we checked it
2476 // above.
2477 if (ViableTopFallthrough) {
2478 assert(std::next(ExitIt) != LoopChain.end() &&
2479 "Exit should not be last BB");
2480 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2481 if (ExitingBB->isSuccessor(NextBlockInChain))
2482 if (!Bottom->isSuccessor(Top))
2483 return;
2484 }
2485
2486 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2487 << " at bottom\n");
2488 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2489}
2490
2491/// Attempt to rotate a loop based on profile data to reduce branch cost.
2492///
2493/// With profile data, we can determine the cost in terms of missed fall through
2494/// opportunities when rotating a loop chain and select the best rotation.
2495/// Basically, there are three kinds of cost to consider for each rotation:
2496/// 1. The possibly missed fall through edge (if it exists) from BB out of
2497/// the loop to the loop header.
2498/// 2. The possibly missed fall through edges (if they exist) from the loop
2499/// exits to BB out of the loop.
2500/// 3. The missed fall through edge (if it exists) from the last BB to the
2501/// first BB in the loop chain.
2502/// Therefore, the cost for a given rotation is the sum of costs listed above.
2503/// We select the best rotation with the smallest cost.
2504void MachineBlockPlacement::rotateLoopWithProfile(
2505 BlockChain &LoopChain, const MachineLoop &L,
2506 const BlockFilterSet &LoopBlockSet) {
2507 auto RotationPos = LoopChain.end();
2508 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2509
2510 // The entry block should always be the first BB in a function.
2511 if (ChainHeaderBB->isEntryBlock())
2512 return;
2513
2514 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2515
2516 // A utility lambda that scales up a block frequency by dividing it by a
2517 // branch probability which is the reciprocal of the scale.
2518 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2519 unsigned Scale) -> BlockFrequency {
2520 if (Scale == 0)
2521 return BlockFrequency(0);
2522 // Use operator / between BlockFrequency and BranchProbability to implement
2523 // saturating multiplication.
2524 return Freq / BranchProbability(1, Scale);
2525 };
2526
2527 // Compute the cost of the missed fall-through edge to the loop header if the
2528 // chain head is not the loop header. As we only consider natural loops with
2529 // single header, this computation can be done only once.
2530 BlockFrequency HeaderFallThroughCost(0);
2531 for (auto *Pred : ChainHeaderBB->predecessors()) {
2532 BlockChain *PredChain = BlockToChain[Pred];
2533 if (!LoopBlockSet.count(Pred) &&
2534 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2535 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2536 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2537 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2538 // If the predecessor has only an unconditional jump to the header, we
2539 // need to consider the cost of this jump.
2540 if (Pred->succ_size() == 1)
2541 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2542 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2543 }
2544 }
2545
2546 // Here we collect all exit blocks in the loop, and for each exit we find out
2547 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2548 // as the sum of frequencies of exit edges we collect here, excluding the exit
2549 // edge from the tail of the loop chain.
2551 for (auto *BB : LoopChain) {
2552 auto LargestExitEdgeProb = BranchProbability::getZero();
2553 for (auto *Succ : BB->successors()) {
2554 BlockChain *SuccChain = BlockToChain[Succ];
2555 if (!LoopBlockSet.count(Succ) &&
2556 (!SuccChain || Succ == *SuccChain->begin())) {
2557 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2558 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2559 }
2560 }
2561 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2562 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2563 ExitsWithFreq.emplace_back(BB, ExitFreq);
2564 }
2565 }
2566
2567 // In this loop we iterate every block in the loop chain and calculate the
2568 // cost assuming the block is the head of the loop chain. When the loop ends,
2569 // we should have found the best candidate as the loop chain's head.
2570 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2571 EndIter = LoopChain.end();
2572 Iter != EndIter; Iter++, TailIter++) {
2573 // TailIter is used to track the tail of the loop chain if the block we are
2574 // checking (pointed by Iter) is the head of the chain.
2575 if (TailIter == LoopChain.end())
2576 TailIter = LoopChain.begin();
2577
2578 auto TailBB = *TailIter;
2579
2580 // Calculate the cost by putting this BB to the top.
2581 BlockFrequency Cost = BlockFrequency(0);
2582
2583 // If the current BB is the loop header, we need to take into account the
2584 // cost of the missed fall through edge from outside of the loop to the
2585 // header.
2586 if (Iter != LoopChain.begin())
2587 Cost += HeaderFallThroughCost;
2588
2589 // Collect the loop exit cost by summing up frequencies of all exit edges
2590 // except the one from the chain tail.
2591 for (auto &ExitWithFreq : ExitsWithFreq)
2592 if (TailBB != ExitWithFreq.first)
2593 Cost += ExitWithFreq.second;
2594
2595 // The cost of breaking the once fall-through edge from the tail to the top
2596 // of the loop chain. Here we need to consider three cases:
2597 // 1. If the tail node has only one successor, then we will get an
2598 // additional jmp instruction. So the cost here is (MisfetchCost +
2599 // JumpInstCost) * tail node frequency.
2600 // 2. If the tail node has two successors, then we may still get an
2601 // additional jmp instruction if the layout successor after the loop
2602 // chain is not its CFG successor. Note that the more frequently executed
2603 // jmp instruction will be put ahead of the other one. Assume the
2604 // frequency of those two branches are x and y, where x is the frequency
2605 // of the edge to the chain head, then the cost will be
2606 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2607 // 3. If the tail node has more than two successors (this rarely happens),
2608 // we won't consider any additional cost.
2609 if (TailBB->isSuccessor(*Iter)) {
2610 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2611 if (TailBB->succ_size() == 1)
2612 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2613 else if (TailBB->succ_size() == 2) {
2614 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2615 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2616 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2617 ? TailBBFreq * TailToHeadProb.getCompl()
2618 : TailToHeadFreq;
2619 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2620 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2621 }
2622 }
2623
2624 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2625 << getBlockName(*Iter) << " to the top: "
2626 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2627
2628 if (Cost < SmallestRotationCost) {
2629 SmallestRotationCost = Cost;
2630 RotationPos = Iter;
2631 }
2632 }
2633
2634 if (RotationPos != LoopChain.end()) {
2635 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2636 << " to the top\n");
2637 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2638 }
2639}
2640
2641/// Collect blocks in the given loop that are to be placed.
2642///
2643/// When profile data is available, exclude cold blocks from the returned set;
2644/// otherwise, collect all blocks in the loop.
2645MachineBlockPlacement::BlockFilterSet
2646MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2647 // Collect the blocks in a set ordered by block number, as this gives the same
2648 // order as they appear in the function.
2649 struct MBBCompare {
2650 bool operator()(const MachineBasicBlock *X,
2651 const MachineBasicBlock *Y) const {
2652 return X->getNumber() < Y->getNumber();
2653 }
2654 };
2655 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2656
2657 // Filter cold blocks off from LoopBlockSet when profile data is available.
2658 // Collect the sum of frequencies of incoming edges to the loop header from
2659 // outside. If we treat the loop as a super block, this is the frequency of
2660 // the loop. Then for each block in the loop, we calculate the ratio between
2661 // its frequency and the frequency of the loop block. When it is too small,
2662 // don't add it to the loop chain. If there are outer loops, then this block
2663 // will be merged into the first outer loop chain for which this block is not
2664 // cold anymore. This needs precise profile data and we only do this when
2665 // profile data is available.
2666 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2667 BlockFrequency LoopFreq(0);
2668 for (auto *LoopPred : L.getHeader()->predecessors())
2669 if (!L.contains(LoopPred))
2670 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2671 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2672
2673 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2674 if (LoopBlockSet.count(LoopBB))
2675 continue;
2676 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2677 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2678 continue;
2679 BlockChain *Chain = BlockToChain[LoopBB];
2680 for (MachineBasicBlock *ChainBB : *Chain)
2681 LoopBlockSet.insert(ChainBB);
2682 }
2683 } else
2684 LoopBlockSet.insert(L.block_begin(), L.block_end());
2685
2686 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2687 // std::set. We will only remove blocks and never insert them, which will
2688 // preserve the ordering.
2689 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2690 return Ret;
2691}
2692
2693/// Forms basic block chains from the natural loop structures.
2694///
2695/// These chains are designed to preserve the existing *structure* of the code
2696/// as much as possible. We can then stitch the chains together in a way which
2697/// both preserves the topological structure and minimizes taken conditional
2698/// branches.
2699void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2700 // First recurse through any nested loops, building chains for those inner
2701 // loops.
2702 for (const MachineLoop *InnerLoop : L)
2703 buildLoopChains(*InnerLoop);
2704
2705 assert(BlockWorkList.empty() &&
2706 "BlockWorkList not empty when starting to build loop chains.");
2707 assert(EHPadWorkList.empty() &&
2708 "EHPadWorkList not empty when starting to build loop chains.");
2709 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2710
2711 // Check if we have profile data for this function. If yes, we will rotate
2712 // this loop by modeling costs more precisely which requires the profile data
2713 // for better layout.
2714 bool RotateLoopWithProfile =
2716 (PreciseRotationCost && F->getFunction().hasProfileData());
2717
2718 // First check to see if there is an obviously preferable top block for the
2719 // loop. This will default to the header, but may end up as one of the
2720 // predecessors to the header if there is one which will result in strictly
2721 // fewer branches in the loop body.
2722 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2723
2724 // If we selected just the header for the loop top, look for a potentially
2725 // profitable exit block in the event that rotating the loop can eliminate
2726 // branches by placing an exit edge at the bottom.
2727 //
2728 // Loops are processed innermost to uttermost, make sure we clear
2729 // PreferredLoopExit before processing a new loop.
2730 PreferredLoopExit = nullptr;
2731 BlockFrequency ExitFreq;
2732 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2733 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2734
2735 BlockChain &LoopChain = *BlockToChain[LoopTop];
2736
2737 // FIXME: This is a really lame way of walking the chains in the loop: we
2738 // walk the blocks, and use a set to prevent visiting a particular chain
2739 // twice.
2740 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2741 assert(LoopChain.UnscheduledPredecessors == 0 &&
2742 "LoopChain should not have unscheduled predecessors.");
2743 UpdatedPreds.insert(&LoopChain);
2744
2745 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2746 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2747
2748 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2749
2750 if (RotateLoopWithProfile)
2751 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2752 else
2753 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2754
2755 LLVM_DEBUG({
2756 // Crash at the end so we get all of the debugging output first.
2757 bool BadLoop = false;
2758 if (LoopChain.UnscheduledPredecessors) {
2759 BadLoop = true;
2760 dbgs() << "Loop chain contains a block without its preds placed!\n"
2761 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2762 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2763 }
2764 for (MachineBasicBlock *ChainBB : LoopChain) {
2765 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2766 if (!LoopBlockSet.remove(ChainBB)) {
2767 // We don't mark the loop as bad here because there are real situations
2768 // where this can occur. For example, with an unanalyzable fallthrough
2769 // from a loop block to a non-loop block or vice versa.
2770 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2771 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2772 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2773 << " Bad block: " << getBlockName(ChainBB) << "\n";
2774 }
2775 }
2776
2777 if (!LoopBlockSet.empty()) {
2778 BadLoop = true;
2779 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2780 dbgs() << "Loop contains blocks never placed into a chain!\n"
2781 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2782 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2783 << " Bad block: " << getBlockName(LoopBB) << "\n";
2784 }
2785 assert(!BadLoop && "Detected problems with the placement of this loop.");
2786 });
2787
2788 BlockWorkList.clear();
2789 EHPadWorkList.clear();
2790}
2791
2792void MachineBlockPlacement::buildCFGChains() {
2793 // Ensure that every BB in the function has an associated chain to simplify
2794 // the assumptions of the remaining algorithm.
2795 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2796 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2797 ++FI) {
2798 MachineBasicBlock *BB = &*FI;
2799 BlockChain *Chain =
2800 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2801 // Also, merge any blocks which we cannot reason about and must preserve
2802 // the exact fallthrough behavior for.
2803 while (true) {
2804 Cond.clear();
2805 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2806 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2807 break;
2808
2809 MachineFunction::iterator NextFI = std::next(FI);
2810 MachineBasicBlock *NextBB = &*NextFI;
2811 // Ensure that the layout successor is a viable block, as we know that
2812 // fallthrough is a possibility.
2813 assert(NextFI != FE && "Can't fallthrough past the last block.");
2814 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2815 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2816 << "\n");
2817 Chain->merge(NextBB, nullptr);
2818#ifndef NDEBUG
2819 BlocksWithUnanalyzableExits.insert(&*BB);
2820#endif
2821 FI = NextFI;
2822 BB = NextBB;
2823 }
2824 }
2825
2826 // Build any loop-based chains.
2827 PreferredLoopExit = nullptr;
2828 for (MachineLoop *L : *MLI)
2829 buildLoopChains(*L);
2830
2831 assert(BlockWorkList.empty() &&
2832 "BlockWorkList should be empty before building final chain.");
2833 assert(EHPadWorkList.empty() &&
2834 "EHPadWorkList should be empty before building final chain.");
2835
2836 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2837 for (MachineBasicBlock &MBB : *F)
2838 fillWorkLists(&MBB, UpdatedPreds);
2839
2840 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2841 buildChain(&F->front(), FunctionChain);
2842
2843#ifndef NDEBUG
2844 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2845#endif
2846 LLVM_DEBUG({
2847 // Crash at the end so we get all of the debugging output first.
2848 bool BadFunc = false;
2849 FunctionBlockSetType FunctionBlockSet;
2850 for (MachineBasicBlock &MBB : *F)
2851 FunctionBlockSet.insert(&MBB);
2852
2853 for (MachineBasicBlock *ChainBB : FunctionChain)
2854 if (!FunctionBlockSet.erase(ChainBB)) {
2855 BadFunc = true;
2856 dbgs() << "Function chain contains a block not in the function!\n"
2857 << " Bad block: " << getBlockName(ChainBB) << "\n";
2858 }
2859
2860 if (!FunctionBlockSet.empty()) {
2861 BadFunc = true;
2862 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2863 dbgs() << "Function contains blocks never placed into a chain!\n"
2864 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2865 }
2866 assert(!BadFunc && "Detected problems with the block placement.");
2867 });
2868
2869 // Remember original layout ordering, so we can update terminators after
2870 // reordering to point to the original layout successor.
2871 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2872 F->getNumBlockIDs());
2873 {
2874 MachineBasicBlock *LastMBB = nullptr;
2875 for (auto &MBB : *F) {
2876 if (LastMBB != nullptr)
2877 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2878 LastMBB = &MBB;
2879 }
2880 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2881 }
2882
2883 // Splice the blocks into place.
2884 MachineFunction::iterator InsertPos = F->begin();
2885 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2886 for (MachineBasicBlock *ChainBB : FunctionChain) {
2887 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2888 : " ... ")
2889 << getBlockName(ChainBB) << "\n");
2890 if (InsertPos != MachineFunction::iterator(ChainBB))
2891 F->splice(InsertPos, ChainBB);
2892 else
2893 ++InsertPos;
2894
2895 // Update the terminator of the previous block.
2896 if (ChainBB == *FunctionChain.begin())
2897 continue;
2898 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2899
2900 // FIXME: It would be awesome of updateTerminator would just return rather
2901 // than assert when the branch cannot be analyzed in order to remove this
2902 // boiler plate.
2903 Cond.clear();
2904 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2905
2906#ifndef NDEBUG
2907 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2908 // Given the exact block placement we chose, we may actually not _need_ to
2909 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2910 // do that at this point is a bug.
2911 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2912 !PrevBB->canFallThrough()) &&
2913 "Unexpected block with un-analyzable fallthrough!");
2914 Cond.clear();
2915 TBB = FBB = nullptr;
2916 }
2917#endif
2918
2919 // The "PrevBB" is not yet updated to reflect current code layout, so,
2920 // o. it may fall-through to a block without explicit "goto" instruction
2921 // before layout, and no longer fall-through it after layout; or
2922 // o. just opposite.
2923 //
2924 // analyzeBranch() may return erroneous value for FBB when these two
2925 // situations take place. For the first scenario FBB is mistakenly set NULL;
2926 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2927 // mistakenly pointing to "*BI".
2928 // Thus, if the future change needs to use FBB before the layout is set, it
2929 // has to correct FBB first by using the code similar to the following:
2930 //
2931 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2932 // PrevBB->updateTerminator();
2933 // Cond.clear();
2934 // TBB = FBB = nullptr;
2935 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2936 // // FIXME: This should never take place.
2937 // TBB = FBB = nullptr;
2938 // }
2939 // }
2940 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2941 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2942 }
2943 }
2944
2945 // Fixup the last block.
2946 Cond.clear();
2947 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2948 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2949 MachineBasicBlock *PrevBB = &F->back();
2950 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2951 }
2952
2953 BlockWorkList.clear();
2954 EHPadWorkList.clear();
2955}
2956
2957void MachineBlockPlacement::optimizeBranches() {
2958 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2960
2961 // Now that all the basic blocks in the chain have the proper layout,
2962 // make a final call to analyzeBranch with AllowModify set.
2963 // Indeed, the target may be able to optimize the branches in a way we
2964 // cannot because all branches may not be analyzable.
2965 // E.g., the target may be able to remove an unconditional branch to
2966 // a fallthrough when it occurs after predicated terminators.
2967 for (MachineBasicBlock *ChainBB : FunctionChain) {
2968 Cond.clear();
2969 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2970 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
2971 continue;
2972 if (!TBB || !FBB || Cond.empty())
2973 continue;
2974 // If we are optimizing for size we do not consider the runtime performance.
2975 // Instead, we retain the original branch condition so we have more uniform
2976 // instructions which will benefit ICF.
2977 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
2978 continue;
2979 // If ChainBB has a two-way branch, try to re-order the branches
2980 // such that we branch to the successor with higher probability first.
2981 if (MBPI->getEdgeProbability(ChainBB, TBB) >=
2982 MBPI->getEdgeProbability(ChainBB, FBB))
2983 continue;
2985 continue;
2986 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2987 << getBlockName(ChainBB) << "\n");
2988 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2989 << "\n");
2990 auto Dl = ChainBB->findBranchDebugLoc();
2991 TII->removeBranch(*ChainBB);
2992 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
2993 }
2994}
2995
2996void MachineBlockPlacement::alignBlocks() {
2997 // Walk through the backedges of the function now that we have fully laid out
2998 // the basic blocks and align the destination of each backedge. We don't rely
2999 // exclusively on the loop info here so that we can align backedges in
3000 // unnatural CFGs and backedges that were introduced purely because of the
3001 // loop rotations done during this layout pass.
3003 if (F->getFunction().hasMinSize() ||
3004 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3005 return;
3006 }
3007
3008 BlockChain &FunctionChain = *BlockToChain[&F->front()];
3009 // Empty chain.
3010 if (FunctionChain.begin() == FunctionChain.end())
3011 return;
3012
3013 const BranchProbability ColdProb(1, 5); // 20%
3014 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
3015 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3016 for (MachineBasicBlock *ChainBB : FunctionChain) {
3017 if (ChainBB == *FunctionChain.begin())
3018 continue;
3019
3020 // Don't align non-looping basic blocks. These are unlikely to execute
3021 // enough times to matter in practice. Note that we'll still handle
3022 // unnatural CFGs inside of a natural outer loop (the common case) and
3023 // rotated loops.
3024 MachineLoop *L = MLI->getLoopFor(ChainBB);
3025 if (!L)
3026 continue;
3027
3028 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
3029 unsigned MDAlign = 1;
3030 MDNode *LoopID = L->getLoopID();
3031 if (LoopID) {
3032 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
3033 MDNode *MD = dyn_cast<MDNode>(MDO);
3034 if (MD == nullptr)
3035 continue;
3036 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
3037 if (S == nullptr)
3038 continue;
3039 if (S->getString() == "llvm.loop.align") {
3040 assert(MD->getNumOperands() == 2 &&
3041 "per-loop align metadata should have two operands.");
3042 MDAlign =
3043 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
3044 assert(MDAlign >= 1 && "per-loop align value must be positive.");
3045 }
3046 }
3047 }
3048
3049 // Use max of the TLIAlign and MDAlign
3050 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
3051 if (LoopAlign == 1)
3052 continue; // Don't care about loop alignment.
3053
3054 // If the block is cold relative to the function entry don't waste space
3055 // aligning it.
3056 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3057 if (Freq < WeightedEntryFreq)
3058 continue;
3059
3060 // If the block is cold relative to its loop header, don't align it
3061 // regardless of what edges into the block exist.
3062 MachineBasicBlock *LoopHeader = L->getHeader();
3063 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3064 if (Freq < (LoopHeaderFreq * ColdProb))
3065 continue;
3066
3067 // If the global profiles indicates so, don't align it.
3068 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3069 !TLI->alignLoopsWithOptSize())
3070 continue;
3071
3072 // Check for the existence of a non-layout predecessor which would benefit
3073 // from aligning this block.
3074 MachineBasicBlock *LayoutPred =
3075 &*std::prev(MachineFunction::iterator(ChainBB));
3076
3077 auto DetermineMaxAlignmentPadding = [&]() {
3078 // Set the maximum bytes allowed to be emitted for alignment.
3079 unsigned MaxBytes;
3080 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3082 else
3083 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3084 ChainBB->setMaxBytesForAlignment(MaxBytes);
3085 };
3086
3087 // Force alignment if all the predecessors are jumps. We already checked
3088 // that the block isn't cold above.
3089 if (!LayoutPred->isSuccessor(ChainBB)) {
3090 ChainBB->setAlignment(LoopAlign);
3091 DetermineMaxAlignmentPadding();
3092 continue;
3093 }
3094
3095 // Align this block if the layout predecessor's edge into this block is
3096 // cold relative to the block. When this is true, other predecessors make up
3097 // all of the hot entries into the block and thus alignment is likely to be
3098 // important.
3099 BranchProbability LayoutProb =
3100 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3101 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3102 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3103 ChainBB->setAlignment(LoopAlign);
3104 DetermineMaxAlignmentPadding();
3105 }
3106 }
3107
3108 const bool HasMaxBytesOverride =
3109 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3110
3111 if (AlignAllBlock)
3112 // Align all of the blocks in the function to a specific alignment.
3113 for (MachineBasicBlock &MBB : *F) {
3114 if (HasMaxBytesOverride)
3117 else
3119 }
3120 else if (AlignAllNonFallThruBlocks) {
3121 // Align all of the blocks that have no fall-through predecessors to a
3122 // specific alignment.
3123 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3124 auto LayoutPred = std::prev(MBI);
3125 if (!LayoutPred->isSuccessor(&*MBI)) {
3126 if (HasMaxBytesOverride)
3127 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3129 else
3130 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3131 }
3132 }
3133 }
3134}
3135
3136/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3137/// it was duplicated into its chain predecessor and removed.
3138/// \p BB - Basic block that may be duplicated.
3139///
3140/// \p LPred - Chosen layout predecessor of \p BB.
3141/// Updated to be the chain end if LPred is removed.
3142/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3143/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3144/// Used to identify which blocks to update predecessor
3145/// counts.
3146/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3147/// chosen in the given order due to unnatural CFG
3148/// only needed if \p BB is removed and
3149/// \p PrevUnplacedBlockIt pointed to \p BB.
3150/// @return true if \p BB was removed.
3151bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3152 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
3153 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3154 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3155 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3156 bool Removed, DuplicatedToLPred;
3157 bool DuplicatedToOriginalLPred;
3158 Removed = maybeTailDuplicateBlock(
3159 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3160 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3161 if (!Removed)
3162 return false;
3163 DuplicatedToOriginalLPred = DuplicatedToLPred;
3164 // Iteratively try to duplicate again. It can happen that a block that is
3165 // duplicated into is still small enough to be duplicated again.
3166 // No need to call markBlockSuccessors in this case, as the blocks being
3167 // duplicated from here on are already scheduled.
3168 while (DuplicatedToLPred && Removed) {
3169 MachineBasicBlock *DupBB, *DupPred;
3170 // The removal callback causes Chain.end() to be updated when a block is
3171 // removed. On the first pass through the loop, the chain end should be the
3172 // same as it was on function entry. On subsequent passes, because we are
3173 // duplicating the block at the end of the chain, if it is removed the
3174 // chain will have shrunk by one block.
3175 BlockChain::iterator ChainEnd = Chain.end();
3176 DupBB = *(--ChainEnd);
3177 // Now try to duplicate again.
3178 if (ChainEnd == Chain.begin())
3179 break;
3180 DupPred = *std::prev(ChainEnd);
3181 Removed = maybeTailDuplicateBlock(
3182 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3183 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3184 }
3185 // If BB was duplicated into LPred, it is now scheduled. But because it was
3186 // removed, markChainSuccessors won't be called for its chain. Instead we
3187 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3188 // at the end because repeating the tail duplication can increase the number
3189 // of unscheduled predecessors.
3190 LPred = *std::prev(Chain.end());
3191 if (DuplicatedToOriginalLPred)
3192 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3193 return true;
3194}
3195
3196/// Tail duplicate \p BB into (some) predecessors if profitable.
3197/// \p BB - Basic block that may be duplicated
3198/// \p LPred - Chosen layout predecessor of \p BB
3199/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3200/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3201/// Used to identify which blocks to update predecessor
3202/// counts.
3203/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3204/// chosen in the given order due to unnatural CFG
3205/// only needed if \p BB is removed and
3206/// \p PrevUnplacedBlockIt pointed to \p BB.
3207/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3208/// \return - True if the block was duplicated into all preds and removed.
3209bool MachineBlockPlacement::maybeTailDuplicateBlock(
3210 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3211 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3212 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3213 bool &DuplicatedToLPred) {
3214 DuplicatedToLPred = false;
3215 if (!shouldTailDuplicate(BB))
3216 return false;
3217
3218 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3219 << "\n");
3220
3221 // This has to be a callback because none of it can be done after
3222 // BB is deleted.
3223 bool Removed = false;
3224 auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3225 // Signal to outer function
3226 Removed = true;
3227
3228 // Remove from the Chain and Chain Map
3229 if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) {
3230 It->second->remove(RemBB);
3231 BlockToChain.erase(It);
3232 }
3233
3234 // Handle the unplaced block iterator
3235 if (&(*PrevUnplacedBlockIt) == RemBB) {
3236 PrevUnplacedBlockIt++;
3237 }
3238
3239 // Handle the Work Lists
3240 if (RemBB->isEHPad()) {
3241 llvm::erase(EHPadWorkList, RemBB);
3242 } else {
3243 llvm::erase(BlockWorkList, RemBB);
3244 }
3245
3246 // Handle the filter set
3247 if (BlockFilter) {
3248 auto It = llvm::find(*BlockFilter, RemBB);
3249 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3250 // pointing to the same element as before.
3251 if (It != BlockFilter->end()) {
3252 if (It < PrevUnplacedBlockInFilterIt) {
3253 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3254 // BlockFilter is a SmallVector so all elements after RemBB are
3255 // shifted to the front by 1 after its deletion.
3256 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3257 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3258 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3259 (void)PrevBB;
3260 } else if (It == PrevUnplacedBlockInFilterIt)
3261 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3262 // have to set it to the next element.
3263 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3264 else
3265 BlockFilter->erase(It);
3266 }
3267 }
3268
3269 // Remove the block from loop info.
3270 MLI->removeBlock(RemBB);
3271 if (RemBB == PreferredLoopExit)
3272 PreferredLoopExit = nullptr;
3273
3274 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3275 << "\n");
3276 };
3277 auto RemovalCallbackRef =
3278 function_ref<void(MachineBasicBlock *)>(RemovalCallback);
3279
3280 SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
3281 bool IsSimple = TailDup.isSimpleBB(BB);
3282 SmallVector<MachineBasicBlock *, 8> CandidatePreds;
3283 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3284 if (F->getFunction().hasProfileData()) {
3285 // We can do partial duplication with precise profile information.
3286 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3287 if (CandidatePreds.size() == 0)
3288 return false;
3289 if (CandidatePreds.size() < BB->pred_size())
3290 CandidatePtr = &CandidatePreds;
3291 }
3292 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3293 &RemovalCallbackRef, CandidatePtr);
3294
3295 // Update UnscheduledPredecessors to reflect tail-duplication.
3296 DuplicatedToLPred = false;
3297 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3298 // We're only looking for unscheduled predecessors that match the filter.
3299 BlockChain *PredChain = BlockToChain[Pred];
3300 if (Pred == LPred)
3301 DuplicatedToLPred = true;
3302 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
3303 PredChain == &Chain)
3304 continue;
3305 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3306 if (BlockFilter && !BlockFilter->count(NewSucc))
3307 continue;
3308 BlockChain *NewChain = BlockToChain[NewSucc];
3309 if (NewChain != &Chain && NewChain != PredChain)
3310 NewChain->UnscheduledPredecessors++;
3311 }
3312 }
3313 return Removed;
3314}
3315
3316// Count the number of actual machine instructions.
3318 uint64_t InstrCount = 0;
3319 for (MachineInstr &MI : *MBB) {
3320 if (!MI.isPHI() && !MI.isMetaInstruction())
3321 InstrCount += 1;
3322 }
3323 return InstrCount;
3324}
3325
3326// The size cost of duplication is the instruction size of the duplicated block.
3327// So we should scale the threshold accordingly. But the instruction size is not
3328// available on all targets, so we use the number of instructions instead.
3329BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3330 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3331}
3332
3333// Returns true if BB is Pred's best successor.
3334bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3335 MachineBasicBlock *Pred,
3336 BlockFilterSet *BlockFilter) {
3337 if (BB == Pred)
3338 return false;
3339 if (BlockFilter && !BlockFilter->count(Pred))
3340 return false;
3341 BlockChain *PredChain = BlockToChain[Pred];
3342 if (PredChain && (Pred != *std::prev(PredChain->end())))
3343 return false;
3344
3345 // Find the successor with largest probability excluding BB.
3346 BranchProbability BestProb = BranchProbability::getZero();
3347 for (MachineBasicBlock *Succ : Pred->successors())
3348 if (Succ != BB) {
3349 if (BlockFilter && !BlockFilter->count(Succ))
3350 continue;
3351 BlockChain *SuccChain = BlockToChain[Succ];
3352 if (SuccChain && (Succ != *SuccChain->begin()))
3353 continue;
3354 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3355 if (SuccProb > BestProb)
3356 BestProb = SuccProb;
3357 }
3358
3359 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3360 if (BBProb <= BestProb)
3361 return false;
3362
3363 // Compute the number of reduced taken branches if Pred falls through to BB
3364 // instead of another successor. Then compare it with threshold.
3365 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3366 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3367 return Gain > scaleThreshold(BB);
3368}
3369
3370// Find out the predecessors of BB and BB can be beneficially duplicated into
3371// them.
3372void MachineBlockPlacement::findDuplicateCandidates(
3373 SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB,
3374 BlockFilterSet *BlockFilter) {
3375 MachineBasicBlock *Fallthrough = nullptr;
3376 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3377 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3378 SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors());
3379 SmallVector<MachineBasicBlock *, 8> Succs(BB->successors());
3380
3381 // Sort for highest frequency.
3382 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3383 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3384 };
3385 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3386 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3387 };
3388 llvm::stable_sort(Succs, CmpSucc);
3389 llvm::stable_sort(Preds, CmpPred);
3390
3391 auto SuccIt = Succs.begin();
3392 if (SuccIt != Succs.end()) {
3393 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3394 }
3395
3396 // For each predecessors of BB, compute the benefit of duplicating BB,
3397 // if it is larger than the threshold, add it into Candidates.
3398 //
3399 // If we have following control flow.
3400 //
3401 // PB1 PB2 PB3 PB4
3402 // \ | / /\
3403 // \ | / / \
3404 // \ |/ / \
3405 // BB----/ OB
3406 // /\
3407 // / \
3408 // SB1 SB2
3409 //
3410 // And it can be partially duplicated as
3411 //
3412 // PB2+BB
3413 // | PB1 PB3 PB4
3414 // | | / /\
3415 // | | / / \
3416 // | |/ / \
3417 // | BB----/ OB
3418 // |\ /|
3419 // | X |
3420 // |/ \|
3421 // SB2 SB1
3422 //
3423 // The benefit of duplicating into a predecessor is defined as
3424 // Orig_taken_branch - Duplicated_taken_branch
3425 //
3426 // The Orig_taken_branch is computed with the assumption that predecessor
3427 // jumps to BB and the most possible successor is laid out after BB.
3428 //
3429 // The Duplicated_taken_branch is computed with the assumption that BB is
3430 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3431 // SB2 for PB2 in our case). If there is no available successor, the combined
3432 // block jumps to all BB's successor, like PB3 in this example.
3433 //
3434 // If a predecessor has multiple successors, so BB can't be duplicated into
3435 // it. But it can beneficially fall through to BB, and duplicate BB into other
3436 // predecessors.
3437 for (MachineBasicBlock *Pred : Preds) {
3438 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3439
3440 if (!TailDup.canTailDuplicate(BB, Pred)) {
3441 // BB can't be duplicated into Pred, but it is possible to be layout
3442 // below Pred.
3443 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3444 Fallthrough = Pred;
3445 if (SuccIt != Succs.end())
3446 SuccIt++;
3447 }
3448 continue;
3449 }
3450
3451 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3452 BlockFrequency DupCost;
3453 if (SuccIt == Succs.end()) {
3454 // Jump to all successors;
3455 if (Succs.size() > 0)
3456 DupCost += PredFreq;
3457 } else {
3458 // Fallthrough to *SuccIt, jump to all other successors;
3459 DupCost += PredFreq;
3460 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3461 }
3462
3463 assert(OrigCost >= DupCost);
3464 OrigCost -= DupCost;
3465 if (OrigCost > BBDupThreshold) {
3466 Candidates.push_back(Pred);
3467 if (SuccIt != Succs.end())
3468 SuccIt++;
3469 }
3470 }
3471
3472 // No predecessors can optimally fallthrough to BB.
3473 // So we can change one duplication into fallthrough.
3474 if (!Fallthrough) {
3475 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3476 Candidates[0] = Candidates.back();
3477 Candidates.pop_back();
3478 }
3479 }
3480}
3481
3482void MachineBlockPlacement::initTailDupThreshold() {
3483 DupThreshold = BlockFrequency(0);
3484 if (F->getFunction().hasProfileData()) {
3485 // We prefer to use prifile count.
3486 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3487 if (HotThreshold != UINT64_MAX) {
3488 UseProfileCount = true;
3489 DupThreshold =
3490 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3491 } else {
3492 // Profile count is not available, we can use block frequency instead.
3493 BlockFrequency MaxFreq = BlockFrequency(0);
3494 for (MachineBasicBlock &MBB : *F) {
3495 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3496 if (Freq > MaxFreq)
3497 MaxFreq = Freq;
3498 }
3499
3500 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3501 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3502 UseProfileCount = false;
3503 }
3504 }
3505
3506 TailDupSize = TailDupPlacementThreshold;
3507 // If only the aggressive threshold is explicitly set, use it.
3508 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3509 TailDupPlacementThreshold.getNumOccurrences() == 0)
3511
3512 // For aggressive optimization, we can adjust some thresholds to be less
3513 // conservative.
3514 if (OptLevel >= CodeGenOptLevel::Aggressive) {
3515 // At O3 we should be more willing to copy blocks for tail duplication. This
3516 // increases size pressure, so we only do it at O3
3517 // Do this unless only the regular threshold is explicitly set.
3518 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3519 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3521 }
3522
3523 // If there's no threshold provided through options, query the target
3524 // information for a threshold instead.
3525 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3526 (OptLevel < CodeGenOptLevel::Aggressive ||
3527 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3528 TailDupSize = TII->getTailDuplicateSize(OptLevel);
3529}
3530
3531PreservedAnalyses
3534 auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3535 auto MBFI = std::make_unique<MBFIWrapper>(
3537 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3538 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3540 : nullptr;
3541 auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
3543 .getCachedResult<ProfileSummaryAnalysis>(
3544 *MF.getFunction().getParent());
3545 if (!PSI)
3546 report_fatal_error("MachineBlockPlacement requires ProfileSummaryAnalysis",
3547 false);
3548 MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3549 AllowTailMerge);
3550
3551 if (MBP.run(MF))
3553
3554 if (MDT)
3555 MDT->updateBlockNumbers();
3556 if (MPDT)
3557 MPDT->updateBlockNumbers();
3558 return PreservedAnalyses::all();
3559}
3560
3562 raw_ostream &OS,
3563 function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3564 OS << MapClassName2PassName(name());
3565 if (!AllowTailMerge)
3566 OS << "<no-tail-merge>";
3567}
3568
3569bool MachineBlockPlacement::run(MachineFunction &MF) {
3570
3571 // Check for single-block functions and skip them.
3572 if (std::next(MF.begin()) == MF.end())
3573 return false;
3574
3575 F = &MF;
3576 OptLevel = F->getTarget().getOptLevel();
3577
3578 TII = MF.getSubtarget().getInstrInfo();
3579 TLI = MF.getSubtarget().getTargetLowering();
3580
3581 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3582 // there are no MachineLoops.
3583 PreferredLoopExit = nullptr;
3584
3585 assert(BlockToChain.empty() &&
3586 "BlockToChain map should be empty before starting placement.");
3587 assert(ComputedEdges.empty() &&
3588 "Computed Edge map should be empty before starting placement.");
3589
3590 // Initialize tail duplication thresholds.
3591 initTailDupThreshold();
3592
3593 const bool OptForSize =
3594 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3595 // Determine whether to use ext-tsp for perf/size optimization. The method
3596 // is beneficial only for instances with at least 3 basic blocks and it can be
3597 // disabled for huge functions (exceeding a certain size).
3598 bool UseExtTspForPerf = false;
3599 bool UseExtTspForSize = false;
3600 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3601 UseExtTspForPerf =
3604 UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3605 }
3606
3607 // Apply tail duplication.
3608 if (allowTailDupPlacement(*F)) {
3609 if (OptForSize)
3610 TailDupSize = 1;
3611 const bool PreRegAlloc = false;
3612 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3613 /* LayoutMode */ true, TailDupSize);
3614 if (!UseExtTspForSize)
3615 precomputeTriangleChains();
3616 }
3617
3618 // Run the main block placement.
3619 if (!UseExtTspForSize)
3620 buildCFGChains();
3621
3622 // Changing the layout can create new tail merging opportunities.
3623 // TailMerge can create jump into if branches that make CFG irreducible for
3624 // HW that requires structured CFG.
3625 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3626 AllowTailMerge && BranchFoldPlacement &&
3627 MF.size() > 3;
3628 // No tail merging opportunities if the block number is less than four.
3629 if (EnableTailMerge) {
3630 const unsigned TailMergeSize = TailDupSize + 1;
3631 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3632 *MBFI, *MBPI, PSI, TailMergeSize);
3633
3634 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3635 /*AfterPlacement=*/true)) {
3636 // Must redo the post-dominator tree if blocks were changed.
3637 if (MPDT)
3638 MPDT->recalculate(MF);
3639 if (!UseExtTspForSize) {
3640 // Redo the layout if tail merging creates/removes/moves blocks.
3641 BlockToChain.clear();
3642 ComputedEdges.clear();
3643 ChainAllocator.DestroyAll();
3644 buildCFGChains();
3645 }
3646 }
3647 }
3648
3649 // Apply a post-processing optimizing block placement:
3650 // - find a new placement and modify the layout of the blocks in the function;
3651 // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3652 if (UseExtTspForPerf || UseExtTspForSize) {
3653 assert(
3654 !(UseExtTspForPerf && UseExtTspForSize) &&
3655 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3656 applyExtTsp(/*OptForSize=*/UseExtTspForSize);
3657 createCFGChainExtTsp();
3658 }
3659
3660 optimizeBranches();
3661 alignBlocks();
3662
3663 BlockToChain.clear();
3664 ComputedEdges.clear();
3665 ChainAllocator.DestroyAll();
3666
3667 // View the function.
3669 (ViewBlockFreqFuncName.empty() ||
3670 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3672 MF.RenumberBlocks();
3673 MBFI->view("MBP." + MF.getName(), false);
3674 }
3675
3676 // We always return true as we have no way to track whether the final order
3677 // differs from the original order.
3678 return true;
3679}
3680
3681void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3682 // Prepare data; blocks are indexed by their index in the current ordering.
3683 DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
3684 BlockIndex.reserve(F->size());
3685 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3686 CurrentBlockOrder.reserve(F->size());
3687 size_t NumBlocks = 0;
3688 for (const MachineBasicBlock &MBB : *F) {
3689 BlockIndex[&MBB] = NumBlocks++;
3690 CurrentBlockOrder.push_back(&MBB);
3691 }
3692
3693 SmallVector<uint64_t, 0> BlockCounts(F->size());
3694 SmallVector<uint64_t, 0> BlockSizes(F->size());
3696 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3698 for (MachineBasicBlock &MBB : *F) {
3699 // Getting the block frequency.
3700 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3701 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
3702 // Getting the block size:
3703 // - approximate the size of an instruction by 4 bytes, and
3704 // - ignore debug instructions.
3705 // Note: getting the exact size of each block is target-dependent and can be
3706 // done by extending the interface of MCCodeEmitter. Experimentally we do
3707 // not see a perf improvement with the exact block sizes.
3708 auto NonDbgInsts =
3710 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3711 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3712
3713 // Getting jump frequencies.
3714 if (OptForSize) {
3715 Cond.clear();
3716 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3717 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3718 continue;
3719
3720 const MachineBasicBlock *FTB = MBB.getFallThrough();
3721 // Succs is a collection of distinct destinations of the block reachable
3722 // from MBB via a jump instruction; initialize the list using the three
3723 // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3724 Succs.clear();
3725 if (TBB && TBB != FTB)
3726 Succs.push_back(TBB);
3727 if (FBB && FBB != FTB)
3728 Succs.push_back(FBB);
3729 if (FTB)
3730 Succs.push_back(FTB);
3731 // Absolute magnitude of non-zero counts does not matter for the
3732 // optimization; prioritize slightly jumps with a single successor, since
3733 // the corresponding jump instruction will be removed from the binary.
3734 const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3735 for (const MachineBasicBlock *Succ : Succs)
3736 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3737 } else {
3738 for (MachineBasicBlock *Succ : MBB.successors()) {
3739 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3740 BlockFrequency JumpFreq = BlockFreq * EP;
3741 JumpCounts.push_back(
3742 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3743 }
3744 }
3745 }
3746
3747 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3748 << " with profile = " << F->getFunction().hasProfileData()
3749 << " (" << F->getName() << ")" << "\n");
3750
3751 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3752 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3753
3754 // Run the layout algorithm.
3755 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3756 std::vector<const MachineBasicBlock *> NewBlockOrder;
3757 NewBlockOrder.reserve(F->size());
3758 for (uint64_t Node : NewOrder) {
3759 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3760 }
3761 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3762 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3763
3764 // If the optimization is unsuccessful, fall back to the original block order.
3765 if (OptForSize && OrgScore > OptScore)
3766 assignBlockOrder(CurrentBlockOrder);
3767 else
3768 assignBlockOrder(NewBlockOrder);
3769}
3770
3771void MachineBlockPlacement::assignBlockOrder(
3772 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3773 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3774 F->RenumberBlocks();
3775 // At this point, we possibly removed blocks from the function, so we can't
3776 // renumber the domtree. At this point, we don't need it anymore, though.
3777 // TODO: move this to the point where the dominator tree is actually
3778 // invalidated (i.e., where blocks are removed without updating the domtree).
3779 MPDT = nullptr;
3780
3781 bool HasChanges = false;
3782 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3783 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3784 HasChanges = true;
3785 break;
3786 }
3787 }
3788 // Stop early if the new block order is identical to the existing one.
3789 if (!HasChanges)
3790 return;
3791
3792 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3793 for (auto &MBB : *F) {
3794 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3795 }
3796
3797 // Sort basic blocks in the function according to the computed order.
3798 DenseMap<const MachineBasicBlock *, size_t> NewIndex;
3799 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3800 NewIndex[MBB] = NewIndex.size();
3801 }
3802 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3803 return NewIndex[&L] < NewIndex[&R];
3804 });
3805
3806 // Update basic block branches by inserting explicit fallthrough branches
3807 // when required and re-optimize branches when possible.
3808 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3810 for (auto &MBB : *F) {
3811 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3813 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3814 // If this block had a fallthrough before we need an explicit unconditional
3815 // branch to that block if the fallthrough block is not adjacent to the
3816 // block in the new order.
3817 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3818 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3819 }
3820
3821 // It might be possible to optimize branches by flipping the condition.
3822 Cond.clear();
3823 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3824 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3825 continue;
3826 MBB.updateTerminator(FTMBB);
3827 }
3828}
3829
3830void MachineBlockPlacement::createCFGChainExtTsp() {
3831 BlockToChain.clear();
3832 ComputedEdges.clear();
3833 ChainAllocator.DestroyAll();
3834
3835 MachineBasicBlock *HeadBB = &F->front();
3836 BlockChain *FunctionChain =
3837 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3838
3839 for (MachineBasicBlock &MBB : *F) {
3840 if (HeadBB == &MBB)
3841 continue; // Ignore head of the chain
3842 FunctionChain->merge(&MBB, nullptr);
3843 }
3844}
3845
3846namespace {
3847
3848/// A pass to compute block placement statistics.
3849///
3850/// A separate pass to compute interesting statistics for evaluating block
3851/// placement. This is separate from the actual placement pass so that they can
3852/// be computed in the absence of any placement transformations or when using
3853/// alternative placement strategies.
3854class MachineBlockPlacementStats {
3855 /// A handle to the branch probability pass.
3856 const MachineBranchProbabilityInfo *MBPI;
3857
3858 /// A handle to the function-wide block frequency pass.
3859 const MachineBlockFrequencyInfo *MBFI;
3860
3861public:
3862 MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3863 const MachineBlockFrequencyInfo *MBFI)
3864 : MBPI(MBPI), MBFI(MBFI) {}
3865 bool run(MachineFunction &MF);
3866};
3867
3868class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3869public:
3870 static char ID; // Pass identification, replacement for typeid
3871
3872 MachineBlockPlacementStatsLegacy() : MachineFunctionPass(ID) {}
3873
3874 bool runOnMachineFunction(MachineFunction &F) override {
3875 auto *MBPI =
3876 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3877 auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3878 return MachineBlockPlacementStats(MBPI, MBFI).run(F);
3879 }
3880
3881 void getAnalysisUsage(AnalysisUsage &AU) const override {
3882 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
3883 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
3884 AU.setPreservesAll();
3886 }
3887};
3888
3889} // end anonymous namespace
3890
3891char MachineBlockPlacementStatsLegacy::ID = 0;
3892
3893char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3894
3895INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3896 "Basic Block Placement Stats", false, false)
3899INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3900 "Basic Block Placement Stats", false, false)
3901
3905 auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3906 auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
3907
3908 MachineBlockPlacementStats(&MBPI, &MBFI).run(MF);
3909 return PreservedAnalyses::all();
3910}
3911
3912bool MachineBlockPlacementStats::run(MachineFunction &F) {
3913 // Check for single-block functions and skip them.
3914 if (std::next(F.begin()) == F.end())
3915 return false;
3916
3917 if (!isFunctionInPrintList(F.getName()))
3918 return false;
3919
3920 for (MachineBasicBlock &MBB : F) {
3921 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3922 Statistic &NumBranches =
3923 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3924 Statistic &BranchTakenFreq =
3925 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3926 for (MachineBasicBlock *Succ : MBB.successors()) {
3927 // Skip if this successor is a fallthrough.
3928 if (MBB.isLayoutSuccessor(Succ))
3929 continue;
3930
3931 BlockFrequency EdgeFreq =
3932 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3933 ++NumBranches;
3934 BranchTakenFreq += EdgeFreq.getFrequency();
3935 }
3936 }
3937
3938 return false;
3939}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
static unsigned InstrCount
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > PredecessorLimit("block-placement-predecessor-limit", cl::desc("For blocks with more predecessors, certain layout optimizations" "will be disabled to prevent quadratic compile time."), cl::init(1000), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static cl::opt< bool > ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement."))
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunities in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
static const char * name
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition Function.h:334
Module * getParent()
Get the module that this global value is contained inside of...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:624
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void setAlignment(Align A)
Set alignment of the basic block.
LLVM_ABI bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
LLVM_ABI DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
LLVM_ABI bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
Analysis pass which computes a MachineDominatorTree.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Analysis pass that exposes the MachineLoopInfo for a machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
LLVM_ABI uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition SetVector.h:72
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
reference emplace_back(ArgTypes &&... Args)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition Allocator.h:440
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition Allocator.h:411
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
#define UINT64_MAX
Definition DataTypes.h:77
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
LLVM_ABI double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
LLVM_ABI std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
cl::opt< bool > ApplyExtTspWithoutProfile
constexpr from_range_t from_range
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
NoopStatistic Statistic
Definition Statistic.h:162
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1915
cl::opt< unsigned > StaticLikelyProb
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2182
LLVM_ABI Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
LLVM_ABI char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
LLVM_ABI char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872