LLVM 23.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
28#include "BranchFolding.h"
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/ADT/DenseMap.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
35#include "llvm/ADT/Statistic.h"
52#include "llvm/IR/DebugLoc.h"
53#include "llvm/IR/Function.h"
54#include "llvm/IR/PrintPasses.h"
56#include "llvm/Pass.h"
63#include "llvm/Support/Debug.h"
67#include <algorithm>
68#include <cassert>
69#include <cstdint>
70#include <iterator>
71#include <memory>
72#include <string>
73#include <tuple>
74#include <utility>
75#include <vector>
76
77using namespace llvm;
78
79#define DEBUG_TYPE "block-placement"
80
81STATISTIC(NumCondBranches, "Number of conditional branches");
82STATISTIC(NumUncondBranches, "Number of unconditional branches");
83STATISTIC(CondBranchTakenFreq,
84 "Potential frequency of taking conditional branches");
85STATISTIC(UncondBranchTakenFreq,
86 "Potential frequency of taking unconditional branches");
87
89 "align-all-blocks",
90 cl::desc("Force the alignment of all blocks in the function in log2 format "
91 "(e.g 4 means align on 16B boundaries)."),
93
95 "align-all-nofallthru-blocks",
96 cl::desc("Force the alignment of all blocks that have no fall-through "
97 "predecessors (i.e. don't add nops that are executed). In log2 "
98 "format (e.g 4 means align on 16B boundaries)."),
100
102 "max-bytes-for-alignment",
103 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
104 "alignment"),
105 cl::init(0), cl::Hidden);
106
108 "block-placement-predecessor-limit",
109 cl::desc("For blocks with more predecessors, certain layout optimizations"
110 "will be disabled to prevent quadratic compile time."),
111 cl::init(1000), cl::Hidden);
112
113// FIXME: Find a good default for this flag and remove the flag.
115 "block-placement-exit-block-bias",
116 cl::desc("Block frequency percentage a loop exit block needs "
117 "over the original exit to be considered the new exit."),
118 cl::init(0), cl::Hidden);
119
120// Definition:
121// - Outlining: placement of a basic block outside the chain or hot path.
122
124 "loop-to-cold-block-ratio",
125 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
126 "(frequency of block) is greater than this ratio"),
127 cl::init(5), cl::Hidden);
128
129static cl::opt<bool>
130 ForceLoopColdBlock("force-loop-cold-block",
131 cl::desc("Force outlining cold blocks from loops."),
132 cl::init(false), cl::Hidden);
133
134static cl::opt<bool>
135 PreciseRotationCost("precise-rotation-cost",
136 cl::desc("Model the cost of loop rotation more "
137 "precisely by using profile data."),
138 cl::init(false), cl::Hidden);
139
140static cl::opt<bool>
141 ForcePreciseRotationCost("force-precise-rotation-cost",
142 cl::desc("Force the use of precise cost "
143 "loop rotation strategy."),
144 cl::init(false), cl::Hidden);
145
147 "misfetch-cost",
148 cl::desc("Cost that models the probabilistic risk of an instruction "
149 "misfetch due to a jump comparing to falling through, whose cost "
150 "is zero."),
151 cl::init(1), cl::Hidden);
152
153static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154 cl::desc("Cost of jump instructions."),
155 cl::init(1), cl::Hidden);
156static cl::opt<bool>
157 TailDupPlacement("tail-dup-placement",
158 cl::desc("Perform tail duplication during placement. "
159 "Creates more fallthrough opportunities in "
160 "outline branches."),
161 cl::init(true), cl::Hidden);
162
163static cl::opt<bool>
164 BranchFoldPlacement("branch-fold-placement",
165 cl::desc("Perform branch folding during placement. "
166 "Reduces code size."),
167 cl::init(true), cl::Hidden);
168
169// Heuristic for tail duplication.
171 "tail-dup-placement-threshold",
172 cl::desc("Instruction cutoff for tail duplication during layout. "
173 "Tail merging during layout is forced to have a threshold "
174 "that won't conflict."),
175 cl::init(2), cl::Hidden);
176
177// Heuristic for aggressive tail duplication.
179 "tail-dup-placement-aggressive-threshold",
180 cl::desc("Instruction cutoff for aggressive tail duplication during "
181 "layout. Used at -O3. Tail merging during layout is forced to "
182 "have a threshold that won't conflict."),
183 cl::init(4), cl::Hidden);
184
185// Heuristic for tail duplication.
187 "tail-dup-placement-penalty",
188 cl::desc(
189 "Cost penalty for blocks that can avoid breaking CFG by copying. "
190 "Copying can increase fallthrough, but it also increases icache "
191 "pressure. This parameter controls the penalty to account for that. "
192 "Percent as integer."),
193 cl::init(2), cl::Hidden);
194
195// Heuristic for tail duplication if profile count is used in cost model.
197 "tail-dup-profile-percent-threshold",
198 cl::desc("If profile count information is used in tail duplication cost "
199 "model, the gained fall through number from tail duplication "
200 "should be at least this percent of hot count."),
201 cl::init(50), cl::Hidden);
202
203// Heuristic for triangle chains.
205 "triangle-chain-count",
206 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
207 "triangle tail duplication heuristic to kick in. 0 to disable."),
208 cl::init(2), cl::Hidden);
209
210// Use case: When block layout is visualized after MBP pass, the basic blocks
211// are labeled in layout order; meanwhile blocks could be numbered in a
212// different order. It's hard to map between the graph and pass output.
213// With this option on, the basic blocks are renumbered in function layout
214// order. For debugging only.
216 "renumber-blocks-before-view",
217 cl::desc(
218 "If true, basic blocks are re-numbered before MBP layout is printed "
219 "into a dot graph. Only used when a function is being printed."),
220 cl::init(false), cl::Hidden);
221
223 "ext-tsp-block-placement-max-blocks",
224 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
225 "block placement."),
226 cl::init(UINT_MAX), cl::Hidden);
227
228// Apply the ext-tsp algorithm minimizing the size of a binary.
229static cl::opt<bool>
230 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
231 cl::desc("Use ext-tsp for size-aware block placement."));
232
233namespace llvm {
238
239// Internal option used to control BFI display only after MBP pass.
240// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241// -view-block-layout-with-bfi=
243
244// Command line option to specify the name of the function for CFG dump
245// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
247} // namespace llvm
248
249namespace {
250
251class BlockChain;
252
253/// Type for our function-wide basic block -> block chain mapping.
254using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
255
256/// A chain of blocks which will be laid out contiguously.
257///
258/// This is the datastructure representing a chain of consecutive blocks that
259/// are profitable to layout together in order to maximize fallthrough
260/// probabilities and code locality. We also can use a block chain to represent
261/// a sequence of basic blocks which have some external (correctness)
262/// requirement for sequential layout.
263///
264/// Chains can be built around a single basic block and can be merged to grow
265/// them. They participate in a block-to-chain mapping, which is updated
266/// automatically as chains are merged together.
267class BlockChain {
268 /// The sequence of blocks belonging to this chain.
269 ///
270 /// This is the sequence of blocks for a particular chain. These will be laid
271 /// out in-order within the function.
273
274 /// A handle to the function-wide basic block to block chain mapping.
275 ///
276 /// This is retained in each block chain to simplify the computation of child
277 /// block chains for SCC-formation and iteration. We store the edges to child
278 /// basic blocks, and map them back to their associated chains using this
279 /// structure.
280 BlockToChainMapType &BlockToChain;
281
282public:
283 /// Construct a new BlockChain.
284 ///
285 /// This builds a new block chain representing a single basic block in the
286 /// function. It also registers itself as the chain that block participates
287 /// in with the BlockToChain mapping.
288 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289 : Blocks(1, BB), BlockToChain(BlockToChain) {
290 assert(BB && "Cannot create a chain with a null basic block");
291 BlockToChain[BB] = this;
292 }
293
294 /// Iterator over blocks within the chain.
297
298 /// Beginning of blocks within the chain.
299 iterator begin() { return Blocks.begin(); }
300 const_iterator begin() const { return Blocks.begin(); }
301
302 /// End of blocks within the chain.
303 iterator end() { return Blocks.end(); }
304 const_iterator end() const { return Blocks.end(); }
305
306 bool remove(MachineBasicBlock *BB) {
307 for (iterator i = begin(); i != end(); ++i) {
308 if (*i == BB) {
309 Blocks.erase(i);
310 return true;
311 }
312 }
313 return false;
314 }
315
316 /// Merge a block chain into this one.
317 ///
318 /// This routine merges a block chain into this one. It takes care of forming
319 /// a contiguous sequence of basic blocks, updating the edge list, and
320 /// updating the block -> chain mapping. It does not free or tear down the
321 /// old chain, but the old chain's block list is no longer valid.
322 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
323 assert(BB && "Can't merge a null block.");
324 assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326 // Fast path in case we don't have a chain already.
327 if (!Chain) {
328 assert(!BlockToChain[BB] &&
329 "Passed chain is null, but BB has entry in BlockToChain.");
330 Blocks.push_back(BB);
331 BlockToChain[BB] = this;
332 return;
333 }
334
335 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336 assert(Chain->begin() != Chain->end());
337
338 // Update the incoming blocks to point to this chain, and add them to the
339 // chain structure.
340 for (MachineBasicBlock *ChainBB : *Chain) {
341 Blocks.push_back(ChainBB);
342 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343 BlockToChain[ChainBB] = this;
344 }
345 }
346
347#ifndef NDEBUG
348 /// Dump the blocks in this chain.
349 LLVM_DUMP_METHOD void dump() {
350 for (MachineBasicBlock *MBB : *this)
351 MBB->dump();
352 }
353#endif // NDEBUG
354
355 /// Count of predecessors of any block within the chain which have not
356 /// yet been scheduled. In general, we will delay scheduling this chain
357 /// until those predecessors are scheduled (or we find a sufficiently good
358 /// reason to override this heuristic.) Note that when forming loop chains,
359 /// blocks outside the loop are ignored and treated as if they were already
360 /// scheduled.
361 ///
362 /// Note: This field is reinitialized multiple times - once for each loop,
363 /// and then once for the function as a whole.
364 unsigned UnscheduledPredecessors = 0;
365};
366
367class MachineBlockPlacement {
368 /// A type for a block filter set.
369 using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
370
371 /// Pair struct containing basic block and taildup profitability
372 struct BlockAndTailDupResult {
373 MachineBasicBlock *BB = nullptr;
374 bool ShouldTailDup;
375 };
376
377 /// Triple struct containing edge weight and the edge.
378 struct WeightedEdge {
379 BlockFrequency Weight;
380 MachineBasicBlock *Src = nullptr;
381 MachineBasicBlock *Dest = nullptr;
382 };
383
384 /// work lists of blocks that are ready to be laid out
387
388 /// Edges that have already been computed as optimal.
389 DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
390
391 /// Machine Function
392 MachineFunction *F = nullptr;
393
394 /// A handle to the branch probability pass.
395 const MachineBranchProbabilityInfo *MBPI = nullptr;
396
397 /// A handle to the function-wide block frequency pass.
398 std::unique_ptr<MBFIWrapper> MBFI;
399
400 /// A handle to the loop info.
401 MachineLoopInfo *MLI = nullptr;
402
403 /// Preferred loop exit.
404 /// Member variable for convenience. It may be removed by duplication deep
405 /// in the call stack.
406 MachineBasicBlock *PreferredLoopExit = nullptr;
407
408 /// A handle to the target's instruction info.
409 const TargetInstrInfo *TII = nullptr;
410
411 /// A handle to the target's lowering info.
412 const TargetLoweringBase *TLI = nullptr;
413
414 /// A handle to the post dominator tree.
415 MachinePostDominatorTree *MPDT = nullptr;
416
417 ProfileSummaryInfo *PSI = nullptr;
418
419 // Tail merging is also determined based on
420 // whether structured CFG is required.
421 bool AllowTailMerge;
422
423 CodeGenOptLevel OptLevel;
424
425 /// Duplicator used to duplicate tails during placement.
426 ///
427 /// Placement decisions can open up new tail duplication opportunities, but
428 /// since tail duplication affects placement decisions of later blocks, it
429 /// must be done inline.
430 TailDuplicator TailDup;
431
432 /// Partial tail duplication threshold.
433 BlockFrequency DupThreshold;
434
435 unsigned TailDupSize;
436
437 /// True: use block profile count to compute tail duplication cost.
438 /// False: use block frequency to compute tail duplication cost.
439 bool UseProfileCount = false;
440
441 /// Allocator and owner of BlockChain structures.
442 ///
443 /// We build BlockChains lazily while processing the loop structure of
444 /// a function. To reduce malloc traffic, we allocate them using this
445 /// slab-like allocator, and destroy them after the pass completes. An
446 /// important guarantee is that this allocator produces stable pointers to
447 /// the chains.
448 SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
449
450 /// Function wide BasicBlock to BlockChain mapping.
451 ///
452 /// This mapping allows efficiently moving from any given basic block to the
453 /// BlockChain it participates in, if any. We use it to, among other things,
454 /// allow implicitly defining edges between chains as the existing edges
455 /// between basic blocks.
456 DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
457
458#ifndef NDEBUG
459 /// The set of basic blocks that have terminators that cannot be fully
460 /// analyzed. These basic blocks cannot be re-ordered safely by
461 /// MachineBlockPlacement, and we must preserve physical layout of these
462 /// blocks and their successors through the pass.
463 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
464#endif
465
466 /// Get block profile count or frequency according to UseProfileCount.
467 /// The return value is used to model tail duplication cost.
468 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469 if (UseProfileCount) {
470 auto Count = MBFI->getBlockProfileCount(BB);
471 if (Count)
472 return BlockFrequency(*Count);
473 else
474 return BlockFrequency(0);
475 } else
476 return MBFI->getBlockFreq(BB);
477 }
478
479 /// Scale the DupThreshold according to basic block size.
480 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481 void initTailDupThreshold();
482
483 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484 /// if the count goes to 0, add them to the appropriate work list.
485 void markChainSuccessors(const BlockChain &Chain,
486 const MachineBasicBlock *LoopHeaderBB,
487 const BlockFilterSet *BlockFilter = nullptr);
488
489 /// Decrease the UnscheduledPredecessors count for a single block, and
490 /// if the count goes to 0, add them to the appropriate work list.
491 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492 const MachineBasicBlock *LoopHeaderBB,
493 const BlockFilterSet *BlockFilter = nullptr);
494
495 BranchProbability
496 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
497 const BlockFilterSet *BlockFilter,
498 SmallVector<MachineBasicBlock *, 4> &Successors);
499 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
500 BlockFilterSet *BlockFilter);
501 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
502 MachineBasicBlock *BB,
503 BlockFilterSet *BlockFilter);
504 bool repeatedlyTailDuplicateBlock(
505 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
506 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507 BlockFilterSet *BlockFilter,
508 MachineFunction::iterator &PrevUnplacedBlockIt,
509 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510 bool
511 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
512 BlockChain &Chain, BlockFilterSet *BlockFilter,
513 MachineFunction::iterator &PrevUnplacedBlockIt,
514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515 bool &DuplicatedToLPred);
516 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517 const MachineBasicBlock *Succ,
518 const BlockChain &SuccChain,
519 BranchProbability SuccProb,
520 BranchProbability RealSuccProb,
521 const BlockChain &Chain,
522 const BlockFilterSet *BlockFilter);
523 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524 const BlockChain &Chain,
525 const BlockFilterSet *BlockFilter);
526 MachineBasicBlock *
527 selectBestCandidateBlock(const BlockChain &Chain,
528 SmallVectorImpl<MachineBasicBlock *> &WorkList);
529 MachineBasicBlock *
530 getFirstUnplacedBlock(const BlockChain &PlacedChain,
531 MachineFunction::iterator &PrevUnplacedBlockIt);
532 MachineBasicBlock *
533 getFirstUnplacedBlock(const BlockChain &PlacedChain,
534 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535 const BlockFilterSet *BlockFilter);
536
537 /// Add a basic block to the work list if it is appropriate.
538 ///
539 /// If the optional parameter BlockFilter is provided, only MBB
540 /// present in the set will be added to the worklist. If nullptr
541 /// is provided, no filtering occurs.
542 void fillWorkLists(const MachineBasicBlock *MBB,
543 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544 const BlockFilterSet *BlockFilter);
545
546 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547 BlockFilterSet *BlockFilter = nullptr);
548 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549 const MachineBasicBlock *OldTop);
550 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551 const BlockFilterSet &LoopBlockSet);
552 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553 const BlockFilterSet &LoopBlockSet);
554 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555 const MachineBasicBlock *OldTop,
556 const MachineBasicBlock *ExitBB,
557 const BlockFilterSet &LoopBlockSet);
558 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
559 const MachineLoop &L,
560 const BlockFilterSet &LoopBlockSet);
561 MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
562 const BlockFilterSet &LoopBlockSet);
563 MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
564 const BlockFilterSet &LoopBlockSet,
565 BlockFrequency &ExitFreq);
566 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567 void buildLoopChains(const MachineLoop &L);
568 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571 const BlockFilterSet &LoopBlockSet);
572 void buildCFGChains();
573 void optimizeBranches();
574 void alignBlocks();
575 /// Returns true if a block should be tail-duplicated to increase fallthrough
576 /// opportunities.
577 bool shouldTailDuplicate(MachineBasicBlock *BB);
578 /// Check the edge frequencies to see if tail duplication will increase
579 /// fallthroughs.
580 bool isProfitableToTailDup(const MachineBasicBlock *BB,
581 const MachineBasicBlock *Succ,
582 BranchProbability QProb, const BlockChain &Chain,
583 const BlockFilterSet *BlockFilter);
584
585 /// Check for a trellis layout.
586 bool isTrellis(const MachineBasicBlock *BB,
587 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590 /// Get the best successor given a trellis layout.
591 BlockAndTailDupResult getBestTrellisSuccessor(
592 const MachineBasicBlock *BB,
593 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594 BranchProbability AdjustedSumProb, const BlockChain &Chain,
595 const BlockFilterSet *BlockFilter);
596
597 /// Get the best pair of non-conflicting edges.
598 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599 const MachineBasicBlock *BB,
601
602 /// Returns true if a block can tail duplicate into all unplaced
603 /// predecessors. Filters based on loop.
604 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605 MachineBasicBlock *Succ,
606 const BlockChain &Chain,
607 const BlockFilterSet *BlockFilter);
608
609 /// Find chains of triangles to tail-duplicate where a global analysis works,
610 /// but a local analysis would not find them.
611 void precomputeTriangleChains();
612
613 /// Apply a post-processing step optimizing block placement.
614 void applyExtTsp(bool OptForSize);
615
616 /// Modify the existing block placement in the function and adjust all jumps.
617 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619 /// Create a single CFG chain from the current block order.
620 void createCFGChainExtTsp();
621
622public:
623 MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
624 MachineLoopInfo *MLI, ProfileSummaryInfo *PSI,
625 std::unique_ptr<MBFIWrapper> MBFI,
626 MachinePostDominatorTree *MPDT, bool AllowTailMerge)
627 : MBPI(MBPI), MBFI(std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628 AllowTailMerge(AllowTailMerge) {};
629
630 bool run(MachineFunction &F);
631
632 static bool allowTailDupPlacement(MachineFunction &MF) {
634 }
635};
636
637class MachineBlockPlacementLegacy : public MachineFunctionPass {
638public:
639 static char ID; // Pass identification, replacement for typeid
640
641 MachineBlockPlacementLegacy() : MachineFunctionPass(ID) {
643 }
644
645 bool runOnMachineFunction(MachineFunction &MF) override {
646 if (skipFunction(MF.getFunction()))
647 return false;
648
649 auto *MBPI =
650 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
651 auto MBFI = std::make_unique<MBFIWrapper>(
652 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
653 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
654 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
655 ? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
656 .getPostDomTree()
657 : nullptr;
658 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
659 auto *PassConfig = &getAnalysis<TargetPassConfig>();
660 bool AllowTailMerge = PassConfig->getEnableTailMerge();
661 return MachineBlockPlacement(MBPI, MLI, PSI, std::move(MBFI), MPDT,
662 AllowTailMerge)
663 .run(MF);
664 }
665
666 void getAnalysisUsage(AnalysisUsage &AU) const override {
667 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
668 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
670 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
671 AU.addRequired<MachineLoopInfoWrapperPass>();
672 AU.addRequired<ProfileSummaryInfoWrapperPass>();
673 AU.addRequired<TargetPassConfig>();
675 }
676};
677
678} // end anonymous namespace
679
680char MachineBlockPlacementLegacy::ID = 0;
681
682char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
683
684INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
685 "Branch Probability Basic Block Placement", false, false)
691INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
692 "Branch Probability Basic Block Placement", false, false)
693
694#ifndef NDEBUG
695/// Helper to print the name of a MBB.
696///
697/// Only used by debug logging.
698static std::string getBlockName(const MachineBasicBlock *BB) {
699 std::string Result;
700 raw_string_ostream OS(Result);
701 OS << printMBBReference(*BB);
702 OS << " ('" << BB->getName() << "')";
703 return Result;
704}
705#endif
706
707/// Mark a chain's successors as having one fewer preds.
708///
709/// When a chain is being merged into the "placed" chain, this routine will
710/// quickly walk the successors of each block in the chain and mark them as
711/// having one fewer active predecessor. It also adds any successors of this
712/// chain which reach the zero-predecessor state to the appropriate worklist.
713void MachineBlockPlacement::markChainSuccessors(
714 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
715 const BlockFilterSet *BlockFilter) {
716 // Walk all the blocks in this chain, marking their successors as having
717 // a predecessor placed.
718 for (MachineBasicBlock *MBB : Chain) {
719 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
720 }
721}
722
723/// Mark a single block's successors as having one fewer preds.
724///
725/// Under normal circumstances, this is only called by markChainSuccessors,
726/// but if a block that was to be placed is completely tail-duplicated away,
727/// and was duplicated into the chain end, we need to redo markBlockSuccessors
728/// for just that block.
729void MachineBlockPlacement::markBlockSuccessors(
730 const BlockChain &Chain, const MachineBasicBlock *MBB,
731 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
732 // Add any successors for which this is the only un-placed in-loop
733 // predecessor to the worklist as a viable candidate for CFG-neutral
734 // placement. No subsequent placement of this block will violate the CFG
735 // shape, so we get to use heuristics to choose a favorable placement.
736 for (MachineBasicBlock *Succ : MBB->successors()) {
737 if (BlockFilter && !BlockFilter->count(Succ))
738 continue;
739 BlockChain &SuccChain = *BlockToChain[Succ];
740 // Disregard edges within a fixed chain, or edges to the loop header.
741 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
742 continue;
743
744 // This is a cross-chain edge that is within the loop, so decrement the
745 // loop predecessor count of the destination chain.
746 if (SuccChain.UnscheduledPredecessors == 0 ||
747 --SuccChain.UnscheduledPredecessors > 0)
748 continue;
749
750 auto *NewBB = *SuccChain.begin();
751 if (NewBB->isEHPad())
752 EHPadWorkList.push_back(NewBB);
753 else
754 BlockWorkList.push_back(NewBB);
755 }
756}
757
758/// This helper function collects the set of successors of block
759/// \p BB that are allowed to be its layout successors, and return
760/// the total branch probability of edges from \p BB to those
761/// blocks.
762BranchProbability MachineBlockPlacement::collectViableSuccessors(
763 const MachineBasicBlock *BB, const BlockChain &Chain,
764 const BlockFilterSet *BlockFilter,
765 SmallVector<MachineBasicBlock *, 4> &Successors) {
766 // Adjust edge probabilities by excluding edges pointing to blocks that is
767 // either not in BlockFilter or is already in the current chain. Consider the
768 // following CFG:
769 //
770 // --->A
771 // | / \
772 // | B C
773 // | \ / \
774 // ----D E
775 //
776 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
777 // A->C is chosen as a fall-through, D won't be selected as a successor of C
778 // due to CFG constraint (the probability of C->D is not greater than
779 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
780 // when calculating the probability of C->D, D will be selected and we
781 // will get A C D B as the layout of this loop.
782 auto AdjustedSumProb = BranchProbability::getOne();
783 for (MachineBasicBlock *Succ : BB->successors()) {
784 bool SkipSucc = false;
785 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
786 SkipSucc = true;
787 } else {
788 BlockChain *SuccChain = BlockToChain[Succ];
789 if (SuccChain == &Chain) {
790 SkipSucc = true;
791 } else if (Succ != *SuccChain->begin()) {
792 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
793 << " -> Mid chain!\n");
794 continue;
795 }
796 }
797 if (SkipSucc)
798 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
799 else
800 Successors.push_back(Succ);
801 }
802
803 return AdjustedSumProb;
804}
805
806/// The helper function returns the branch probability that is adjusted
807/// or normalized over the new total \p AdjustedSumProb.
808static BranchProbability
810 BranchProbability AdjustedSumProb) {
811 BranchProbability SuccProb;
812 uint32_t SuccProbN = OrigProb.getNumerator();
813 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
814 if (SuccProbN >= SuccProbD)
815 SuccProb = BranchProbability::getOne();
816 else
817 SuccProb = BranchProbability(SuccProbN, SuccProbD);
818
819 return SuccProb;
820}
821
822/// Check if \p BB has exactly the successors in \p Successors.
823static bool
826 if (BB.succ_size() != Successors.size())
827 return false;
828 // We don't want to count self-loops
829 if (Successors.count(&BB))
830 return false;
831 for (MachineBasicBlock *Succ : BB.successors())
832 if (!Successors.count(Succ))
833 return false;
834 return true;
835}
836
837/// Check if a block should be tail duplicated to increase fallthrough
838/// opportunities.
839/// \p BB Block to check.
840bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
841 // Blocks with single successors don't create additional fallthrough
842 // opportunities. Don't duplicate them. TODO: When conditional exits are
843 // analyzable, allow them to be duplicated.
844 bool IsSimple = TailDup.isSimpleBB(BB);
845
846 if (BB->succ_size() == 1)
847 return false;
848 return TailDup.shouldTailDuplicate(IsSimple, *BB);
849}
850
851/// Compare 2 BlockFrequency's with a small penalty for \p A.
852/// In order to be conservative, we apply a X% penalty to account for
853/// increased icache pressure and static heuristics. For small frequencies
854/// we use only the numerators to improve accuracy. For simplicity, we assume
855/// the penalty is less than 100%
856/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
858 BlockFrequency EntryFreq) {
859 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
860 BlockFrequency Gain = A - B;
861 return (Gain / ThresholdProb) >= EntryFreq;
862}
863
864/// Check the edge frequencies to see if tail duplication will increase
865/// fallthroughs. It only makes sense to call this function when
866/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
867/// always locally profitable if we would have picked \p Succ without
868/// considering duplication.
869bool MachineBlockPlacement::isProfitableToTailDup(
870 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
871 BranchProbability QProb, const BlockChain &Chain,
872 const BlockFilterSet *BlockFilter) {
873 // We need to do a probability calculation to make sure this is profitable.
874 // First: does succ have a successor that post-dominates? This affects the
875 // calculation. The 2 relevant cases are:
876 // BB BB
877 // | \Qout | \Qout
878 // P| C |P C
879 // = C' = C'
880 // | /Qin | /Qin
881 // | / | /
882 // Succ Succ
883 // / \ | \ V
884 // U/ =V |U \
885 // / \ = D
886 // D E | /
887 // | /
888 // |/
889 // PDom
890 // '=' : Branch taken for that CFG edge
891 // In the second case, Placing Succ while duplicating it into C prevents the
892 // fallthrough of Succ into either D or PDom, because they now have C as an
893 // unplaced predecessor
894
895 // Start by figuring out which case we fall into
896 MachineBasicBlock *PDom = nullptr;
897 SmallVector<MachineBasicBlock *, 4> SuccSuccs;
898 // Only scan the relevant successors
899 auto AdjustedSuccSumProb =
900 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
901 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
902 auto BBFreq = MBFI->getBlockFreq(BB);
903 auto SuccFreq = MBFI->getBlockFreq(Succ);
904 BlockFrequency P = BBFreq * PProb;
905 BlockFrequency Qout = BBFreq * QProb;
906 BlockFrequency EntryFreq = MBFI->getEntryFreq();
907 // If there are no more successors, it is profitable to copy, as it strictly
908 // increases fallthrough.
909 if (SuccSuccs.size() == 0)
910 return greaterWithBias(P, Qout, EntryFreq);
911
912 auto BestSuccSucc = BranchProbability::getZero();
913 // Find the PDom or the best Succ if no PDom exists.
914 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
915 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
916 if (Prob > BestSuccSucc)
917 BestSuccSucc = Prob;
918 if (PDom == nullptr)
919 if (MPDT->dominates(SuccSucc, Succ)) {
920 PDom = SuccSucc;
921 break;
922 }
923 }
924 // For the comparisons, we need to know Succ's best incoming edge that isn't
925 // from BB.
926 auto SuccBestPred = BlockFrequency(0);
927 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
928 if (SuccPred == Succ || SuccPred == BB ||
929 BlockToChain[SuccPred] == &Chain ||
930 (BlockFilter && !BlockFilter->count(SuccPred)))
931 continue;
932 auto Freq =
933 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
934 if (Freq > SuccBestPred)
935 SuccBestPred = Freq;
936 }
937 // Qin is Succ's best unplaced incoming edge that isn't BB
938 BlockFrequency Qin = SuccBestPred;
939 // If it doesn't have a post-dominating successor, here is the calculation:
940 // BB BB
941 // | \Qout | \
942 // P| C | =
943 // = C' | C
944 // | /Qin | |
945 // | / | C' (+Succ)
946 // Succ Succ /|
947 // / \ | \/ |
948 // U/ =V | == |
949 // / \ | / \|
950 // D E D E
951 // '=' : Branch taken for that CFG edge
952 // Cost in the first case is: P + V
953 // For this calculation, we always assume P > Qout. If Qout > P
954 // The result of this function will be ignored at the caller.
955 // Let F = SuccFreq - Qin
956 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
957
958 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
959 BranchProbability UProb = BestSuccSucc;
960 BranchProbability VProb = AdjustedSuccSumProb - UProb;
961 BlockFrequency F = SuccFreq - Qin;
962 BlockFrequency V = SuccFreq * VProb;
963 BlockFrequency QinU = std::min(Qin, F) * UProb;
964 BlockFrequency BaseCost = P + V;
965 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
966 return greaterWithBias(BaseCost, DupCost, EntryFreq);
967 }
968 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
969 BranchProbability VProb = AdjustedSuccSumProb - UProb;
970 BlockFrequency U = SuccFreq * UProb;
971 BlockFrequency V = SuccFreq * VProb;
972 BlockFrequency F = SuccFreq - Qin;
973 // If there is a post-dominating successor, here is the calculation:
974 // BB BB BB BB
975 // | \Qout | \ | \Qout | \
976 // |P C | = |P C | =
977 // = C' |P C = C' |P C
978 // | /Qin | | | /Qin | |
979 // | / | C' (+Succ) | / | C' (+Succ)
980 // Succ Succ /| Succ Succ /|
981 // | \ V | \/ | | \ V | \/ |
982 // |U \ |U /\ =? |U = |U /\ |
983 // = D = = =?| | D | = =|
984 // | / |/ D | / |/ D
985 // | / | / | = | /
986 // |/ | / |/ | =
987 // Dom Dom Dom Dom
988 // '=' : Branch taken for that CFG edge
989 // The cost for taken branches in the first case is P + U
990 // Let F = SuccFreq - Qin
991 // The cost in the second case (assuming independence), given the layout:
992 // BB, Succ, (C+Succ), D, Dom or the layout:
993 // BB, Succ, D, Dom, (C+Succ)
994 // is Qout + max(F, Qin) * U + min(F, Qin)
995 // compare P + U vs Qout + P * U + Qin.
996 //
997 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
998 //
999 // For the 3rd case, the cost is P + 2 * V
1000 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
1001 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
1002 if (UProb > AdjustedSuccSumProb / 2 &&
1003 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
1004 Chain, BlockFilter))
1005 // Cases 3 & 4
1006 return greaterWithBias(
1007 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
1008 EntryFreq);
1009 // Cases 1 & 2
1010 return greaterWithBias((P + U),
1011 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
1012 std::max(Qin, F) * UProb),
1013 EntryFreq);
1014}
1015
1016/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1017/// successors form the lower part of a trellis. A successor set S forms the
1018/// lower part of a trellis if all of the predecessors of S are either in S or
1019/// have all of S as successors. We ignore trellises where BB doesn't have 2
1020/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1021/// are very uncommon and complex to compute optimally. Allowing edges within S
1022/// is not strictly a trellis, but the same algorithm works, so we allow it.
1023bool MachineBlockPlacement::isTrellis(
1024 const MachineBasicBlock *BB,
1025 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1026 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1027 // Technically BB could form a trellis with branching factor higher than 2.
1028 // But that's extremely uncommon.
1029 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
1030 return false;
1031
1032 SmallPtrSet<const MachineBasicBlock *, 2> Successors(llvm::from_range,
1033 BB->successors());
1034 // To avoid reviewing the same predecessors twice.
1035 SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
1036
1037 for (MachineBasicBlock *Succ : ViableSuccs) {
1038 // Compile-time optimization: runtime is quadratic in the number of
1039 // predecessors. For such uncommon cases, exit early.
1040 if (Succ->pred_size() > PredecessorLimit)
1041 return false;
1042
1043 int PredCount = 0;
1044 for (auto *SuccPred : Succ->predecessors()) {
1045 // Allow triangle successors, but don't count them.
1046 if (Successors.count(SuccPred)) {
1047 // Make sure that it is actually a triangle.
1048 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1049 if (!Successors.count(CheckSucc))
1050 return false;
1051 continue;
1052 }
1053 const BlockChain *PredChain = BlockToChain[SuccPred];
1054 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1055 PredChain == &Chain || PredChain == BlockToChain[Succ])
1056 continue;
1057 ++PredCount;
1058 // Perform the successor check only once.
1059 if (!SeenPreds.insert(SuccPred).second)
1060 continue;
1061 if (!hasSameSuccessors(*SuccPred, Successors))
1062 return false;
1063 }
1064 // If one of the successors has only BB as a predecessor, it is not a
1065 // trellis.
1066 if (PredCount < 1)
1067 return false;
1068 }
1069 return true;
1070}
1071
1072/// Pick the highest total weight pair of edges that can both be laid out.
1073/// The edges in \p Edges[0] are assumed to have a different destination than
1074/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1075/// the individual highest weight edges to the 2 different destinations, or in
1076/// case of a conflict, one of them should be replaced with a 2nd best edge.
1077std::pair<MachineBlockPlacement::WeightedEdge,
1078 MachineBlockPlacement::WeightedEdge>
1079MachineBlockPlacement::getBestNonConflictingEdges(
1080 const MachineBasicBlock *BB,
1082 Edges) {
1083 // Sort the edges, and then for each successor, find the best incoming
1084 // predecessor. If the best incoming predecessors aren't the same,
1085 // then that is clearly the best layout. If there is a conflict, one of the
1086 // successors will have to fallthrough from the second best predecessor. We
1087 // compare which combination is better overall.
1088
1089 // Sort for highest frequency.
1090 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1091
1092 llvm::stable_sort(Edges[0], Cmp);
1093 llvm::stable_sort(Edges[1], Cmp);
1094 auto BestA = Edges[0].begin();
1095 auto BestB = Edges[1].begin();
1096 // Arrange for the correct answer to be in BestA and BestB
1097 // If the 2 best edges don't conflict, the answer is already there.
1098 if (BestA->Src == BestB->Src) {
1099 // Compare the total fallthrough of (Best + Second Best) for both pairs
1100 auto SecondBestA = std::next(BestA);
1101 auto SecondBestB = std::next(BestB);
1102 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1103 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1104 if (BestAScore < BestBScore)
1105 BestA = SecondBestA;
1106 else
1107 BestB = SecondBestB;
1108 }
1109 // Arrange for the BB edge to be in BestA if it exists.
1110 if (BestB->Src == BB)
1111 std::swap(BestA, BestB);
1112 return std::make_pair(*BestA, *BestB);
1113}
1114
1115/// Get the best successor from \p BB based on \p BB being part of a trellis.
1116/// We only handle trellises with 2 successors, so the algorithm is
1117/// straightforward: Find the best pair of edges that don't conflict. We find
1118/// the best incoming edge for each successor in the trellis. If those conflict,
1119/// we consider which of them should be replaced with the second best.
1120/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1121/// comes from \p BB, it will be in \p BestEdges[0]
1122MachineBlockPlacement::BlockAndTailDupResult
1123MachineBlockPlacement::getBestTrellisSuccessor(
1124 const MachineBasicBlock *BB,
1125 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1126 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1127 const BlockFilterSet *BlockFilter) {
1128
1129 BlockAndTailDupResult Result = {nullptr, false};
1130 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1131 BB->successors());
1132
1133 // We assume size 2 because it's common. For general n, we would have to do
1134 // the Hungarian algorithm, but it's not worth the complexity because more
1135 // than 2 successors is fairly uncommon, and a trellis even more so.
1136 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1137 return Result;
1138
1139 // Collect the edge frequencies of all edges that form the trellis.
1141 int SuccIndex = 0;
1142 for (auto *Succ : ViableSuccs) {
1143 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1144 // Skip any placed predecessors that are not BB
1145 if (SuccPred != BB) {
1146 if (BlockFilter && !BlockFilter->count(SuccPred))
1147 continue;
1148 const BlockChain *SuccPredChain = BlockToChain[SuccPred];
1149 if (SuccPredChain == &Chain || SuccPredChain == BlockToChain[Succ])
1150 continue;
1151 }
1152 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1153 MBPI->getEdgeProbability(SuccPred, Succ);
1154 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1155 }
1156 ++SuccIndex;
1157 }
1158
1159 // Pick the best combination of 2 edges from all the edges in the trellis.
1160 WeightedEdge BestA, BestB;
1161 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1162
1163 if (BestA.Src != BB) {
1164 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1165 // we shouldn't choose any successor. We've already looked and there's a
1166 // better fallthrough edge for all the successors.
1167 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1168 return Result;
1169 }
1170
1171 // Did we pick the triangle edge? If tail-duplication is profitable, do
1172 // that instead. Otherwise merge the triangle edge now while we know it is
1173 // optimal.
1174 if (BestA.Dest == BestB.Src) {
1175 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1176 // would be better.
1177 MachineBasicBlock *Succ1 = BestA.Dest;
1178 MachineBasicBlock *Succ2 = BestB.Dest;
1179 // Check to see if tail-duplication would be profitable.
1180 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ2) &&
1181 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1182 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1183 Chain, BlockFilter)) {
1184 LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
1185 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1186 dbgs() << " Selected: " << getBlockName(Succ2)
1187 << ", probability: " << Succ2Prob
1188 << " (Tail Duplicate)\n");
1189 Result.BB = Succ2;
1190 Result.ShouldTailDup = true;
1191 return Result;
1192 }
1193 }
1194 // We have already computed the optimal edge for the other side of the
1195 // trellis.
1196 ComputedEdges[BestB.Src] = {BestB.Dest, false};
1197
1198 auto TrellisSucc = BestA.Dest;
1199 LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
1200 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1201 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1202 << ", probability: " << SuccProb << " (Trellis)\n");
1203 Result.BB = TrellisSucc;
1204 return Result;
1205}
1206
1207/// When the option allowTailDupPlacement() is on, this method checks if the
1208/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1209/// into all of its unplaced, unfiltered predecessors, that are not BB.
1210bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1211 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1212 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1213 if (!shouldTailDuplicate(Succ))
1214 return false;
1215
1216 // The result of canTailDuplicate.
1217 bool Duplicate = true;
1218 // Number of possible duplication.
1219 unsigned int NumDup = 0;
1220
1221 // For CFG checking.
1222 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1223 BB->successors());
1224 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1225 // Make sure all unplaced and unfiltered predecessors can be
1226 // tail-duplicated into.
1227 // Skip any blocks that are already placed or not in this loop.
1228 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
1229 (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1230 continue;
1231 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1232 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1233 // This will result in a trellis after tail duplication, so we don't
1234 // need to copy Succ into this predecessor. In the presence
1235 // of a trellis tail duplication can continue to be profitable.
1236 // For example:
1237 // A A
1238 // |\ |\
1239 // | \ | \
1240 // | C | C+BB
1241 // | / | |
1242 // |/ | |
1243 // BB => BB |
1244 // |\ |\/|
1245 // | \ |/\|
1246 // | D | D
1247 // | / | /
1248 // |/ |/
1249 // Succ Succ
1250 //
1251 // After BB was duplicated into C, the layout looks like the one on the
1252 // right. BB and C now have the same successors. When considering
1253 // whether Succ can be duplicated into all its unplaced predecessors, we
1254 // ignore C.
1255 // We can do this because C already has a profitable fallthrough, namely
1256 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1257 // duplication and for this test.
1258 //
1259 // This allows trellises to be laid out in 2 separate chains
1260 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1261 // because it allows the creation of 2 fallthrough paths with links
1262 // between them, and we correctly identify the best layout for these
1263 // CFGs. We want to extend trellises that the user created in addition
1264 // to trellises created by tail-duplication, so we just look for the
1265 // CFG.
1266 continue;
1267 Duplicate = false;
1268 continue;
1269 }
1270 NumDup++;
1271 }
1272
1273 // No possible duplication in current filter set.
1274 if (NumDup == 0)
1275 return false;
1276
1277 // If profile information is available, findDuplicateCandidates can do more
1278 // precise benefit analysis.
1279 if (F->getFunction().hasProfileData())
1280 return true;
1281
1282 // This is mainly for function exit BB.
1283 // The integrated tail duplication is really designed for increasing
1284 // fallthrough from predecessors from Succ to its successors. We may need
1285 // other machanism to handle different cases.
1286 if (Succ->succ_empty())
1287 return true;
1288
1289 // Plus the already placed predecessor.
1290 NumDup++;
1291
1292 // If the duplication candidate has more unplaced predecessors than
1293 // successors, the extra duplication can't bring more fallthrough.
1294 //
1295 // Pred1 Pred2 Pred3
1296 // \ | /
1297 // \ | /
1298 // \ | /
1299 // Dup
1300 // / \
1301 // / \
1302 // Succ1 Succ2
1303 //
1304 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1305 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1306 // but the duplication into Pred3 can't increase fallthrough.
1307 //
1308 // A small number of extra duplication may not hurt too much. We need a better
1309 // heuristic to handle it.
1310 if ((NumDup > Succ->succ_size()) || !Duplicate)
1311 return false;
1312
1313 return true;
1314}
1315
1316/// Find chains of triangles where we believe it would be profitable to
1317/// tail-duplicate them all, but a local analysis would not find them.
1318/// There are 3 ways this can be profitable:
1319/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1320/// longer chains)
1321/// 2) The chains are statically correlated. Branch probabilities have a very
1322/// U-shaped distribution.
1323/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1324/// If the branches in a chain are likely to be from the same side of the
1325/// distribution as their predecessor, but are independent at runtime, this
1326/// transformation is profitable. (Because the cost of being wrong is a small
1327/// fixed cost, unlike the standard triangle layout where the cost of being
1328/// wrong scales with the # of triangles.)
1329/// 3) The chains are dynamically correlated. If the probability that a previous
1330/// branch was taken positively influences whether the next branch will be
1331/// taken
1332/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1333void MachineBlockPlacement::precomputeTriangleChains() {
1334 struct TriangleChain {
1335 std::vector<MachineBasicBlock *> Edges;
1336
1337 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1338 : Edges({src, dst}) {}
1339
1340 void append(MachineBasicBlock *dst) {
1341 assert(getKey()->isSuccessor(dst) &&
1342 "Attempting to append a block that is not a successor.");
1343 Edges.push_back(dst);
1344 }
1345
1346 unsigned count() const { return Edges.size() - 1; }
1347
1348 MachineBasicBlock *getKey() const { return Edges.back(); }
1349 };
1350
1351 if (TriangleChainCount == 0)
1352 return;
1353
1354 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1355 // Map from last block to the chain that contains it. This allows us to extend
1356 // chains as we find new triangles.
1357 DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
1358 for (MachineBasicBlock &BB : *F) {
1359 // If BB doesn't have 2 successors, it doesn't start a triangle.
1360 if (BB.succ_size() != 2)
1361 continue;
1362 MachineBasicBlock *PDom = nullptr;
1363 for (MachineBasicBlock *Succ : BB.successors()) {
1364 if (!MPDT->dominates(Succ, &BB))
1365 continue;
1366 PDom = Succ;
1367 break;
1368 }
1369 // If BB doesn't have a post-dominating successor, it doesn't form a
1370 // triangle.
1371 if (PDom == nullptr)
1372 continue;
1373 // If PDom has a hint that it is low probability, skip this triangle.
1374 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1375 continue;
1376 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1377 // we're looking for.
1378 if (!shouldTailDuplicate(PDom))
1379 continue;
1380 bool CanTailDuplicate = true;
1381 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1382 // isn't the kind of triangle we're looking for.
1383 for (MachineBasicBlock *Pred : PDom->predecessors()) {
1384 if (Pred == &BB)
1385 continue;
1386 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1387 CanTailDuplicate = false;
1388 break;
1389 }
1390 }
1391 // If we can't tail-duplicate PDom to its predecessors, then skip this
1392 // triangle.
1393 if (!CanTailDuplicate)
1394 continue;
1395
1396 // Now we have an interesting triangle. Insert it if it's not part of an
1397 // existing chain.
1398 // Note: This cannot be replaced with a call insert() or emplace() because
1399 // the find key is BB, but the insert/emplace key is PDom.
1400 auto Found = TriangleChainMap.find(&BB);
1401 // If it is, remove the chain from the map, grow it, and put it back in the
1402 // map with the end as the new key.
1403 if (Found != TriangleChainMap.end()) {
1404 TriangleChain Chain = std::move(Found->second);
1405 TriangleChainMap.erase(Found);
1406 Chain.append(PDom);
1407 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1408 } else {
1409 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1410 assert(InsertResult.second && "Block seen twice.");
1411 (void)InsertResult;
1412 }
1413 }
1414
1415 // Iterating over a DenseMap is safe here, because the only thing in the body
1416 // of the loop is inserting into another DenseMap (ComputedEdges).
1417 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1418 for (auto &ChainPair : TriangleChainMap) {
1419 TriangleChain &Chain = ChainPair.second;
1420 // Benchmarking has shown that due to branch correlation duplicating 2 or
1421 // more triangles is profitable, despite the calculations assuming
1422 // independence.
1423 if (Chain.count() < TriangleChainCount)
1424 continue;
1425 MachineBasicBlock *dst = Chain.Edges.back();
1426 Chain.Edges.pop_back();
1427 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1428 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1429 << getBlockName(dst)
1430 << " as pre-computed based on triangles.\n");
1431
1432 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1433 assert(InsertResult.second && "Block seen twice.");
1434 (void)InsertResult;
1435
1436 dst = src;
1437 }
1438 }
1439}
1440
1441// When profile is not present, return the StaticLikelyProb.
1442// When profile is available, we need to handle the triangle-shape CFG.
1443static BranchProbability
1445 if (!BB->getParent()->getFunction().hasProfileData())
1447 if (BB->succ_size() == 2) {
1448 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1449 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1450 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1451 /* See case 1 below for the cost analysis. For BB->Succ to
1452 * be taken with smaller cost, the following needs to hold:
1453 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1454 * So the threshold T in the calculation below
1455 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1456 * So T / (1 - T) = 2, Yielding T = 2/3
1457 * Also adding user specified branch bias, we have
1458 * T = (2/3)*(ProfileLikelyProb/50)
1459 * = (2*ProfileLikelyProb)/150)
1460 */
1461 return BranchProbability(2 * ProfileLikelyProb, 150);
1462 }
1463 }
1465}
1466
1467/// Checks to see if the layout candidate block \p Succ has a better layout
1468/// predecessor than \c BB. If yes, returns true.
1469/// \p SuccProb: The probability adjusted for only remaining blocks.
1470/// Only used for logging
1471/// \p RealSuccProb: The un-adjusted probability.
1472/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1473/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1474/// considered
1475bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1476 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1477 const BlockChain &SuccChain, BranchProbability SuccProb,
1478 BranchProbability RealSuccProb, const BlockChain &Chain,
1479 const BlockFilterSet *BlockFilter) {
1480
1481 // There isn't a better layout when there are no unscheduled predecessors.
1482 if (SuccChain.UnscheduledPredecessors == 0)
1483 return false;
1484
1485 // Compile-time optimization: runtime is quadratic in the number of
1486 // predecessors. For such uncommon cases, exit early.
1487 if (Succ->pred_size() > PredecessorLimit)
1488 return false;
1489
1490 // There are two basic scenarios here:
1491 // -------------------------------------
1492 // Case 1: triangular shape CFG (if-then):
1493 // BB
1494 // | \
1495 // | \
1496 // | Pred
1497 // | /
1498 // Succ
1499 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1500 // set Succ as the layout successor of BB. Picking Succ as BB's
1501 // successor breaks the CFG constraints (FIXME: define these constraints).
1502 // With this layout, Pred BB
1503 // is forced to be outlined, so the overall cost will be cost of the
1504 // branch taken from BB to Pred, plus the cost of back taken branch
1505 // from Pred to Succ, as well as the additional cost associated
1506 // with the needed unconditional jump instruction from Pred To Succ.
1507
1508 // The cost of the topological order layout is the taken branch cost
1509 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1510 // must hold:
1511 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1512 // < freq(BB->Succ) * taken_branch_cost.
1513 // Ignoring unconditional jump cost, we get
1514 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1515 // prob(BB->Succ) > 2 * prob(BB->Pred)
1516 //
1517 // When real profile data is available, we can precisely compute the
1518 // probability threshold that is needed for edge BB->Succ to be considered.
1519 // Without profile data, the heuristic requires the branch bias to be
1520 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1521 // -----------------------------------------------------------------
1522 // Case 2: diamond like CFG (if-then-else):
1523 // S
1524 // / \
1525 // | \
1526 // BB Pred
1527 // \ /
1528 // Succ
1529 // ..
1530 //
1531 // The current block is BB and edge BB->Succ is now being evaluated.
1532 // Note that edge S->BB was previously already selected because
1533 // prob(S->BB) > prob(S->Pred).
1534 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1535 // choose Pred, we will have a topological ordering as shown on the left
1536 // in the picture below. If we choose Succ, we have the solution as shown
1537 // on the right:
1538 //
1539 // topo-order:
1540 //
1541 // S----- ---S
1542 // | | | |
1543 // ---BB | | BB
1544 // | | | |
1545 // | Pred-- | Succ--
1546 // | | | |
1547 // ---Succ ---Pred--
1548 //
1549 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1550 // = freq(S->Pred) + freq(S->BB)
1551 //
1552 // If we have profile data (i.e, branch probabilities can be trusted), the
1553 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1554 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1555 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1556 // means the cost of topological order is greater.
1557 // When profile data is not available, however, we need to be more
1558 // conservative. If the branch prediction is wrong, breaking the topo-order
1559 // will actually yield a layout with large cost. For this reason, we need
1560 // strong biased branch at block S with Prob(S->BB) in order to select
1561 // BB->Succ. This is equivalent to looking the CFG backward with backward
1562 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1563 // profile data).
1564 // --------------------------------------------------------------------------
1565 // Case 3: forked diamond
1566 // S
1567 // / \
1568 // / \
1569 // BB Pred
1570 // | \ / |
1571 // | \ / |
1572 // | X |
1573 // | / \ |
1574 // | / \ |
1575 // S1 S2
1576 //
1577 // The current block is BB and edge BB->S1 is now being evaluated.
1578 // As above S->BB was already selected because
1579 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1580 //
1581 // topo-order:
1582 //
1583 // S-------| ---S
1584 // | | | |
1585 // ---BB | | BB
1586 // | | | |
1587 // | Pred----| | S1----
1588 // | | | |
1589 // --(S1 or S2) ---Pred--
1590 // |
1591 // S2
1592 //
1593 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1594 // + min(freq(Pred->S1), freq(Pred->S2))
1595 // Non-topo-order cost:
1596 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1597 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1598 // is 0. Then the non topo layout is better when
1599 // freq(S->Pred) < freq(BB->S1).
1600 // This is exactly what is checked below.
1601 // Note there are other shapes that apply (Pred may not be a single block,
1602 // but they all fit this general pattern.)
1603 BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
1604
1605 // Make sure that a hot successor doesn't have a globally more
1606 // important predecessor.
1607 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1608 bool BadCFGConflict = false;
1609
1610 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1611 BlockChain *PredChain = BlockToChain[Pred];
1612 if (Pred == Succ || PredChain == &SuccChain ||
1613 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
1614 Pred != *std::prev(PredChain->end()) ||
1615 // This check is redundant except for look ahead. This function is
1616 // called for lookahead by isProfitableToTailDup when BB hasn't been
1617 // placed yet.
1618 (Pred == BB))
1619 continue;
1620 // Do backward checking.
1621 // For all cases above, we need a backward checking to filter out edges that
1622 // are not 'strongly' biased.
1623 // BB Pred
1624 // \ /
1625 // Succ
1626 // We select edge BB->Succ if
1627 // freq(BB->Succ) > freq(Succ) * HotProb
1628 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1629 // HotProb
1630 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1631 // Case 1 is covered too, because the first equation reduces to:
1632 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1633 BlockFrequency PredEdgeFreq =
1634 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1635 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1636 BadCFGConflict = true;
1637 break;
1638 }
1639 }
1640
1641 if (BadCFGConflict) {
1642 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1643 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1644 return true;
1645 }
1646
1647 return false;
1648}
1649
1650/// Select the best successor for a block.
1651///
1652/// This looks across all successors of a particular block and attempts to
1653/// select the "best" one to be the layout successor. It only considers direct
1654/// successors which also pass the block filter. It will attempt to avoid
1655/// breaking CFG structure, but cave and break such structures in the case of
1656/// very hot successor edges.
1657///
1658/// \returns The best successor block found, or null if none are viable, along
1659/// with a boolean indicating if tail duplication is necessary.
1660MachineBlockPlacement::BlockAndTailDupResult
1661MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1662 const BlockChain &Chain,
1663 const BlockFilterSet *BlockFilter) {
1664 const BranchProbability HotProb(StaticLikelyProb, 100);
1665
1666 BlockAndTailDupResult BestSucc = {nullptr, false};
1667 auto BestProb = BranchProbability::getZero();
1668
1669 SmallVector<MachineBasicBlock *, 4> Successors;
1670 auto AdjustedSumProb =
1671 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1672
1673 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1674 << "\n");
1675
1676 // if we already precomputed the best successor for BB, return that if still
1677 // applicable.
1678 auto FoundEdge = ComputedEdges.find(BB);
1679 if (FoundEdge != ComputedEdges.end()) {
1680 MachineBasicBlock *Succ = FoundEdge->second.BB;
1681 ComputedEdges.erase(FoundEdge);
1682 BlockChain *SuccChain = BlockToChain[Succ];
1683 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1684 SuccChain != &Chain && Succ == *SuccChain->begin())
1685 return FoundEdge->second;
1686 }
1687
1688 // if BB is part of a trellis, Use the trellis to determine the optimal
1689 // fallthrough edges
1690 if (isTrellis(BB, Successors, Chain, BlockFilter))
1691 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1692 BlockFilter);
1693
1694 // For blocks with CFG violations, we may be able to lay them out anyway with
1695 // tail-duplication. We keep this vector so we can perform the probability
1696 // calculations the minimum number of times.
1698 DupCandidates;
1699 for (MachineBasicBlock *Succ : Successors) {
1700 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1701 BranchProbability SuccProb =
1702 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1703
1704 BlockChain &SuccChain = *BlockToChain[Succ];
1705 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1706 // predecessor that yields lower global cost.
1707 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1708 Chain, BlockFilter)) {
1709 // If tail duplication would make Succ profitable, place it.
1710 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ))
1711 DupCandidates.emplace_back(SuccProb, Succ);
1712 continue;
1713 }
1714
1715 LLVM_DEBUG(
1716 dbgs() << " Candidate: " << getBlockName(Succ)
1717 << ", probability: " << SuccProb
1718 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1719 << "\n");
1720
1721 if (BestSucc.BB && BestProb >= SuccProb) {
1722 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1723 continue;
1724 }
1725
1726 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1727 BestSucc.BB = Succ;
1728 BestProb = SuccProb;
1729 }
1730 // Handle the tail duplication candidates in order of decreasing probability.
1731 // Stop at the first one that is profitable. Also stop if they are less
1732 // profitable than BestSucc. Position is important because we preserve it and
1733 // prefer first best match. Here we aren't comparing in order, so we capture
1734 // the position instead.
1735 llvm::stable_sort(DupCandidates,
1736 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1737 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1738 return std::get<0>(L) > std::get<0>(R);
1739 });
1740 for (auto &Tup : DupCandidates) {
1741 BranchProbability DupProb;
1742 MachineBasicBlock *Succ;
1743 std::tie(DupProb, Succ) = Tup;
1744 if (DupProb < BestProb)
1745 break;
1746 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1747 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1748 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1749 << ", probability: " << DupProb
1750 << " (Tail Duplicate)\n");
1751 BestSucc.BB = Succ;
1752 BestSucc.ShouldTailDup = true;
1753 break;
1754 }
1755 }
1756
1757 if (BestSucc.BB)
1758 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1759
1760 return BestSucc;
1761}
1762
1763/// Select the best block from a worklist.
1764///
1765/// This looks through the provided worklist as a list of candidate basic
1766/// blocks and select the most profitable one to place. The definition of
1767/// profitable only really makes sense in the context of a loop. This returns
1768/// the most frequently visited block in the worklist, which in the case of
1769/// a loop, is the one most desirable to be physically close to the rest of the
1770/// loop body in order to improve i-cache behavior.
1771///
1772/// \returns The best block found, or null if none are viable.
1773MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1774 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1775 // Once we need to walk the worklist looking for a candidate, cleanup the
1776 // worklist of already placed entries.
1777 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1778 // some code complexity) into the loop below.
1779 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1780 return BlockToChain.lookup(BB) == &Chain;
1781 });
1782
1783 if (WorkList.empty())
1784 return nullptr;
1785
1786 bool IsEHPad = WorkList[0]->isEHPad();
1787
1788 MachineBasicBlock *BestBlock = nullptr;
1789 BlockFrequency BestFreq;
1790 for (MachineBasicBlock *MBB : WorkList) {
1791 assert(MBB->isEHPad() == IsEHPad &&
1792 "EHPad mismatch between block and work list.");
1793
1794 BlockChain &SuccChain = *BlockToChain[MBB];
1795 if (&SuccChain == &Chain)
1796 continue;
1797
1798 assert(SuccChain.UnscheduledPredecessors == 0 &&
1799 "Found CFG-violating block");
1800
1801 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1802 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1803 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1804 << " (freq)\n");
1805
1806 // For ehpad, we layout the least probable first as to avoid jumping back
1807 // from least probable landingpads to more probable ones.
1808 //
1809 // FIXME: Using probability is probably (!) not the best way to achieve
1810 // this. We should probably have a more principled approach to layout
1811 // cleanup code.
1812 //
1813 // The goal is to get:
1814 //
1815 // +--------------------------+
1816 // | V
1817 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1818 //
1819 // Rather than:
1820 //
1821 // +-------------------------------------+
1822 // V |
1823 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1824 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1825 continue;
1826
1827 BestBlock = MBB;
1828 BestFreq = CandidateFreq;
1829 }
1830
1831 return BestBlock;
1832}
1833
1834/// Retrieve the first unplaced basic block in the entire function.
1835///
1836/// This routine is called when we are unable to use the CFG to walk through
1837/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1838/// We walk through the function's blocks in order, starting from the
1839/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1840/// re-scanning the entire sequence on repeated calls to this routine.
1841MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1842 const BlockChain &PlacedChain,
1843 MachineFunction::iterator &PrevUnplacedBlockIt) {
1844
1845 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1846 ++I) {
1847 if (BlockChain *Chain = BlockToChain[&*I]; Chain != &PlacedChain) {
1848 PrevUnplacedBlockIt = I;
1849 // Now select the head of the chain to which the unplaced block belongs
1850 // as the block to place. This will force the entire chain to be placed,
1851 // and satisfies the requirements of merging chains.
1852 return *Chain->begin();
1853 }
1854 }
1855 return nullptr;
1856}
1857
1858/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1859///
1860/// This is similar to getFirstUnplacedBlock for the entire function, but since
1861/// the size of BlockFilter is typically far less than the number of blocks in
1862/// the entire function, iterating through the BlockFilter is more efficient.
1863/// When processing the entire funciton, using the version without BlockFilter
1864/// has a complexity of #(loops in function) * #(blocks in function), while this
1865/// version has a complexity of sum(#(loops in block) foreach block in function)
1866/// which is always smaller. For long function mostly sequential in structure,
1867/// the complexity is amortized to 1 * #(blocks in function).
1868MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1869 const BlockChain &PlacedChain,
1870 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1871 const BlockFilterSet *BlockFilter) {
1872 assert(BlockFilter);
1873 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1874 ++PrevUnplacedBlockInFilterIt) {
1875 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1876 if (C != &PlacedChain) {
1877 return *C->begin();
1878 }
1879 }
1880 return nullptr;
1881}
1882
1883void MachineBlockPlacement::fillWorkLists(
1884 const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1885 const BlockFilterSet *BlockFilter = nullptr) {
1886 BlockChain &Chain = *BlockToChain[MBB];
1887 if (!UpdatedPreds.insert(&Chain).second)
1888 return;
1889
1890 assert(
1891 Chain.UnscheduledPredecessors == 0 &&
1892 "Attempting to place block with unscheduled predecessors in worklist.");
1893 for (MachineBasicBlock *ChainBB : Chain) {
1894 assert(BlockToChain[ChainBB] == &Chain &&
1895 "Block in chain doesn't match BlockToChain map.");
1896 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1897 if (BlockFilter && !BlockFilter->count(Pred))
1898 continue;
1899 if (BlockToChain[Pred] == &Chain)
1900 continue;
1901 ++Chain.UnscheduledPredecessors;
1902 }
1903 }
1904
1905 if (Chain.UnscheduledPredecessors != 0)
1906 return;
1907
1908 MachineBasicBlock *BB = *Chain.begin();
1909 if (BB->isEHPad())
1910 EHPadWorkList.push_back(BB);
1911 else
1912 BlockWorkList.push_back(BB);
1913}
1914
1915void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1916 BlockChain &Chain,
1917 BlockFilterSet *BlockFilter) {
1918 assert(HeadBB && "BB must not be null.\n");
1919 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1920 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1921 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1922 if (BlockFilter)
1923 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1924
1925 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1926 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1927 MachineBasicBlock *BB = *std::prev(Chain.end());
1928 while (true) {
1929 assert(BB && "null block found at end of chain in loop.");
1930 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1931 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1932
1933 // Look for the best viable successor if there is one to place immediately
1934 // after this block.
1935 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1936 MachineBasicBlock *BestSucc = Result.BB;
1937 bool ShouldTailDup = Result.ShouldTailDup;
1938 if (allowTailDupPlacement(*F))
1939 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(
1940 BB, BestSucc, Chain, BlockFilter));
1941
1942 // If an immediate successor isn't available, look for the best viable
1943 // block among those we've identified as not violating the loop's CFG at
1944 // this point. This won't be a fallthrough, but it will increase locality.
1945 if (!BestSucc)
1946 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1947 if (!BestSucc)
1948 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1949
1950 if (!BestSucc) {
1951 if (BlockFilter)
1952 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1953 BlockFilter);
1954 else
1955 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1956 if (!BestSucc)
1957 break;
1958
1959 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1960 "layout successor until the CFG reduces\n");
1961 }
1962
1963 // Placement may have changed tail duplication opportunities.
1964 // Check for that now.
1965 if (allowTailDupPlacement(*F) && BestSucc && ShouldTailDup) {
1966 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1967 BlockFilter, PrevUnplacedBlockIt,
1968 PrevUnplacedBlockInFilterIt);
1969 // If the chosen successor was duplicated into BB, don't bother laying
1970 // it out, just go round the loop again with BB as the chain end.
1971 if (!BB->isSuccessor(BestSucc))
1972 continue;
1973 }
1974
1975 // Place this block, updating the datastructures to reflect its placement.
1976 BlockChain &SuccChain = *BlockToChain[BestSucc];
1977 // Zero out UnscheduledPredecessors for the successor we're about to merge
1978 // in case we selected a successor that didn't fit naturally into the CFG.
1979 SuccChain.UnscheduledPredecessors = 0;
1980 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1981 << getBlockName(BestSucc) << "\n");
1982 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1983 Chain.merge(BestSucc, &SuccChain);
1984 BB = *std::prev(Chain.end());
1985 }
1986
1987 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1988 << getBlockName(*Chain.begin()) << "\n");
1989}
1990
1991// If bottom of block BB has only one successor OldTop, in most cases it is
1992// profitable to move it before OldTop, except the following case:
1993//
1994// -->OldTop<-
1995// | . |
1996// | . |
1997// | . |
1998// ---Pred |
1999// | |
2000// BB-----
2001//
2002// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2003// layout the other successor below it, so it can't reduce taken branch.
2004// In this case we keep its original layout.
2005bool MachineBlockPlacement::canMoveBottomBlockToTop(
2006 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) {
2007 if (BottomBlock->pred_size() != 1)
2008 return true;
2009 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
2010 if (Pred->succ_size() != 2)
2011 return true;
2012
2013 MachineBasicBlock *OtherBB = *Pred->succ_begin();
2014 if (OtherBB == BottomBlock)
2015 OtherBB = *Pred->succ_rbegin();
2016 if (OtherBB == OldTop)
2017 return false;
2018
2019 return true;
2020}
2021
2022// Find out the possible fall through frequence to the top of a loop.
2023BlockFrequency
2024MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2025 const BlockFilterSet &LoopBlockSet) {
2026 BlockFrequency MaxFreq = BlockFrequency(0);
2027 for (MachineBasicBlock *Pred : Top->predecessors()) {
2028 BlockChain *PredChain = BlockToChain[Pred];
2029 if (!LoopBlockSet.count(Pred) &&
2030 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2031 // Found a Pred block can be placed before Top.
2032 // Check if Top is the best successor of Pred.
2033 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2034 bool TopOK = true;
2035 for (MachineBasicBlock *Succ : Pred->successors()) {
2036 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2037 BlockChain *SuccChain = BlockToChain[Succ];
2038 // Check if Succ can be placed after Pred.
2039 // Succ should not be in any chain, or it is the head of some chain.
2040 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
2041 (!SuccChain || Succ == *SuccChain->begin())) {
2042 TopOK = false;
2043 break;
2044 }
2045 }
2046 if (TopOK) {
2047 BlockFrequency EdgeFreq =
2048 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top);
2049 if (EdgeFreq > MaxFreq)
2050 MaxFreq = EdgeFreq;
2051 }
2052 }
2053 }
2054 return MaxFreq;
2055}
2056
2057// Compute the fall through gains when move NewTop before OldTop.
2058//
2059// In following diagram, edges marked as "-" are reduced fallthrough, edges
2060// marked as "+" are increased fallthrough, this function computes
2061//
2062// SUM(increased fallthrough) - SUM(decreased fallthrough)
2063//
2064// |
2065// | -
2066// V
2067// --->OldTop
2068// | .
2069// | .
2070// +| . +
2071// | Pred --->
2072// | |-
2073// | V
2074// --- NewTop <---
2075// |-
2076// V
2077//
2078BlockFrequency MachineBlockPlacement::FallThroughGains(
2079 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop,
2080 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) {
2081 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2082 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2083 if (ExitBB)
2084 FallThrough2Exit =
2085 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB);
2086 BlockFrequency BackEdgeFreq =
2087 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop);
2088
2089 // Find the best Pred of NewTop.
2090 MachineBasicBlock *BestPred = nullptr;
2091 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2092 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2093 if (!LoopBlockSet.count(Pred))
2094 continue;
2095 BlockChain *PredChain = BlockToChain[Pred];
2096 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2097 BlockFrequency EdgeFreq =
2098 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2099 if (EdgeFreq > FallThroughFromPred) {
2100 FallThroughFromPred = EdgeFreq;
2101 BestPred = Pred;
2102 }
2103 }
2104 }
2105
2106 // If NewTop is not placed after Pred, another successor can be placed
2107 // after Pred.
2108 BlockFrequency NewFreq = BlockFrequency(0);
2109 if (BestPred) {
2110 for (MachineBasicBlock *Succ : BestPred->successors()) {
2111 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2112 continue;
2113 if (ComputedEdges.contains(Succ))
2114 continue;
2115 BlockChain *SuccChain = BlockToChain[Succ];
2116 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2117 (SuccChain == BlockToChain[BestPred]))
2118 continue;
2119 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2120 MBPI->getEdgeProbability(BestPred, Succ);
2121 if (EdgeFreq > NewFreq)
2122 NewFreq = EdgeFreq;
2123 }
2124 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2125 MBPI->getEdgeProbability(BestPred, NewTop);
2126 if (NewFreq > OrigEdgeFreq) {
2127 // If NewTop is not the best successor of Pred, then Pred doesn't
2128 // fallthrough to NewTop. So there is no FallThroughFromPred and
2129 // NewFreq.
2130 NewFreq = BlockFrequency(0);
2131 FallThroughFromPred = BlockFrequency(0);
2132 }
2133 }
2134
2135 BlockFrequency Result = BlockFrequency(0);
2136 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2137 BlockFrequency Lost =
2138 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2139 if (Gains > Lost)
2140 Result = Gains - Lost;
2141 return Result;
2142}
2143
2144/// Helper function of findBestLoopTop. Find the best loop top block
2145/// from predecessors of old top.
2146///
2147/// Look for a block which is strictly better than the old top for laying
2148/// out before the old top of the loop. This looks for only two patterns:
2149///
2150/// 1. a block has only one successor, the old loop top
2151///
2152/// Because such a block will always result in an unconditional jump,
2153/// rotating it in front of the old top is always profitable.
2154///
2155/// 2. a block has two successors, one is old top, another is exit
2156/// and it has more than one predecessors
2157///
2158/// If it is below one of its predecessors P, only P can fall through to
2159/// it, all other predecessors need a jump to it, and another conditional
2160/// jump to loop header. If it is moved before loop header, all its
2161/// predecessors jump to it, then fall through to loop header. So all its
2162/// predecessors except P can reduce one taken branch.
2163/// At the same time, move it before old top increases the taken branch
2164/// to loop exit block, so the reduced taken branch will be compared with
2165/// the increased taken branch to the loop exit block.
2166MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2167 MachineBasicBlock *OldTop, const MachineLoop &L,
2168 const BlockFilterSet &LoopBlockSet) {
2169 // Check that the header hasn't been fused with a preheader block due to
2170 // crazy branches. If it has, we need to start with the header at the top to
2171 // prevent pulling the preheader into the loop body.
2172 BlockChain &HeaderChain = *BlockToChain[OldTop];
2173 if (!LoopBlockSet.count(*HeaderChain.begin()))
2174 return OldTop;
2175 if (OldTop != *HeaderChain.begin())
2176 return OldTop;
2177
2178 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2179 << "\n");
2180
2181 BlockFrequency BestGains = BlockFrequency(0);
2182 MachineBasicBlock *BestPred = nullptr;
2183 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2184 if (!LoopBlockSet.count(Pred))
2185 continue;
2186 if (Pred == L.getHeader())
2187 continue;
2188 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2189 << Pred->succ_size() << " successors, "
2190 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2191 if (Pred->succ_size() > 2)
2192 continue;
2193
2194 MachineBasicBlock *OtherBB = nullptr;
2195 if (Pred->succ_size() == 2) {
2196 OtherBB = *Pred->succ_begin();
2197 if (OtherBB == OldTop)
2198 OtherBB = *Pred->succ_rbegin();
2199 }
2200
2201 if (!canMoveBottomBlockToTop(Pred, OldTop))
2202 continue;
2203
2204 BlockFrequency Gains =
2205 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet);
2206 if ((Gains > BlockFrequency(0)) &&
2207 (Gains > BestGains ||
2208 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2209 BestPred = Pred;
2210 BestGains = Gains;
2211 }
2212 }
2213
2214 // If no direct predecessor is fine, just use the loop header.
2215 if (!BestPred) {
2216 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2217 return OldTop;
2218 }
2219
2220 // Walk backwards through any straight line of predecessors.
2221 while (BestPred->pred_size() == 1 &&
2222 (*BestPred->pred_begin())->succ_size() == 1 &&
2223 *BestPred->pred_begin() != L.getHeader())
2224 BestPred = *BestPred->pred_begin();
2225
2226 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2227 return BestPred;
2228}
2229
2230/// Find the best loop top block for layout.
2231///
2232/// This function iteratively calls findBestLoopTopHelper, until no new better
2233/// BB can be found.
2234MachineBasicBlock *
2235MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2236 const BlockFilterSet &LoopBlockSet) {
2237 // Placing the latch block before the header may introduce an extra branch
2238 // that skips this block the first time the loop is executed, which we want
2239 // to avoid when optimising for size.
2240 // FIXME: in theory there is a case that does not introduce a new branch,
2241 // i.e. when the layout predecessor does not fallthrough to the loop header.
2242 // In practice this never happens though: there always seems to be a preheader
2243 // that can fallthrough and that is also placed before the header.
2244 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()))
2245 return L.getHeader();
2246
2247 MachineBasicBlock *OldTop = nullptr;
2248 MachineBasicBlock *NewTop = L.getHeader();
2249 while (NewTop != OldTop) {
2250 OldTop = NewTop;
2251 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2252 if (NewTop != OldTop)
2253 ComputedEdges[NewTop] = {OldTop, false};
2254 }
2255 return NewTop;
2256}
2257
2258/// Find the best loop exiting block for layout.
2259///
2260/// This routine implements the logic to analyze the loop looking for the best
2261/// block to layout at the top of the loop. Typically this is done to maximize
2262/// fallthrough opportunities.
2263MachineBasicBlock *
2264MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2265 const BlockFilterSet &LoopBlockSet,
2266 BlockFrequency &ExitFreq) {
2267 // We don't want to layout the loop linearly in all cases. If the loop header
2268 // is just a normal basic block in the loop, we want to look for what block
2269 // within the loop is the best one to layout at the top. However, if the loop
2270 // header has be pre-merged into a chain due to predecessors not having
2271 // analyzable branches, *and* the predecessor it is merged with is *not* part
2272 // of the loop, rotating the header into the middle of the loop will create
2273 // a non-contiguous range of blocks which is Very Bad. So start with the
2274 // header and only rotate if safe.
2275 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2276 if (!LoopBlockSet.count(*HeaderChain.begin()))
2277 return nullptr;
2278
2279 BlockFrequency BestExitEdgeFreq;
2280 unsigned BestExitLoopDepth = 0;
2281 MachineBasicBlock *ExitingBB = nullptr;
2282 // If there are exits to outer loops, loop rotation can severely limit
2283 // fallthrough opportunities unless it selects such an exit. Keep a set of
2284 // blocks where rotating to exit with that block will reach an outer loop.
2285 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2286
2287 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2288 << getBlockName(L.getHeader()) << "\n");
2289 for (MachineBasicBlock *MBB : L.getBlocks()) {
2290 BlockChain &Chain = *BlockToChain[MBB];
2291 // Ensure that this block is at the end of a chain; otherwise it could be
2292 // mid-way through an inner loop or a successor of an unanalyzable branch.
2293 if (MBB != *std::prev(Chain.end()))
2294 continue;
2295
2296 // Now walk the successors. We need to establish whether this has a viable
2297 // exiting successor and whether it has a viable non-exiting successor.
2298 // We store the old exiting state and restore it if a viable looping
2299 // successor isn't found.
2300 MachineBasicBlock *OldExitingBB = ExitingBB;
2301 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2302 bool HasLoopingSucc = false;
2303 for (MachineBasicBlock *Succ : MBB->successors()) {
2304 if (Succ->isEHPad())
2305 continue;
2306 if (Succ == MBB)
2307 continue;
2308 BlockChain &SuccChain = *BlockToChain[Succ];
2309 // Don't split chains, either this chain or the successor's chain.
2310 if (&Chain == &SuccChain) {
2311 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2312 << getBlockName(Succ) << " (chain conflict)\n");
2313 continue;
2314 }
2315
2316 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2317 if (LoopBlockSet.count(Succ)) {
2318 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2319 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2320 HasLoopingSucc = true;
2321 continue;
2322 }
2323
2324 unsigned SuccLoopDepth = 0;
2325 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2326 SuccLoopDepth = ExitLoop->getLoopDepth();
2327 if (ExitLoop->contains(&L))
2328 BlocksExitingToOuterLoop.insert(MBB);
2329 }
2330
2331 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2332 LLVM_DEBUG(
2333 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2334 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2335 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2336 // Note that we bias this toward an existing layout successor to retain
2337 // incoming order in the absence of better information. The exit must have
2338 // a frequency higher than the current exit before we consider breaking
2339 // the layout.
2340 BranchProbability Bias(100 - ExitBlockBias, 100);
2341 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2342 ExitEdgeFreq > BestExitEdgeFreq ||
2343 (MBB->isLayoutSuccessor(Succ) &&
2344 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2345 BestExitEdgeFreq = ExitEdgeFreq;
2346 ExitingBB = MBB;
2347 }
2348 }
2349
2350 if (!HasLoopingSucc) {
2351 // Restore the old exiting state, no viable looping successor was found.
2352 ExitingBB = OldExitingBB;
2353 BestExitEdgeFreq = OldBestExitEdgeFreq;
2354 }
2355 }
2356 // Without a candidate exiting block or with only a single block in the
2357 // loop, just use the loop header to layout the loop.
2358 if (!ExitingBB) {
2359 LLVM_DEBUG(
2360 dbgs() << " No other candidate exit blocks, using loop header\n");
2361 return nullptr;
2362 }
2363 if (L.getNumBlocks() == 1) {
2364 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2365 return nullptr;
2366 }
2367
2368 // Also, if we have exit blocks which lead to outer loops but didn't select
2369 // one of them as the exiting block we are rotating toward, disable loop
2370 // rotation altogether.
2371 if (!BlocksExitingToOuterLoop.empty() &&
2372 !BlocksExitingToOuterLoop.count(ExitingBB))
2373 return nullptr;
2374
2375 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2376 << "\n");
2377 ExitFreq = BestExitEdgeFreq;
2378 return ExitingBB;
2379}
2380
2381/// Check if there is a fallthrough to loop header Top.
2382///
2383/// 1. Look for a Pred that can be layout before Top.
2384/// 2. Check if Top is the most possible successor of Pred.
2385bool MachineBlockPlacement::hasViableTopFallthrough(
2386 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) {
2387 for (MachineBasicBlock *Pred : Top->predecessors()) {
2388 BlockChain *PredChain = BlockToChain[Pred];
2389 if (!LoopBlockSet.count(Pred) &&
2390 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2391 // Found a Pred block can be placed before Top.
2392 // Check if Top is the best successor of Pred.
2393 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2394 bool TopOK = true;
2395 for (MachineBasicBlock *Succ : Pred->successors()) {
2396 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2397 BlockChain *SuccChain = BlockToChain[Succ];
2398 // Check if Succ can be placed after Pred.
2399 // Succ should not be in any chain, or it is the head of some chain.
2400 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2401 TopOK = false;
2402 break;
2403 }
2404 }
2405 if (TopOK)
2406 return true;
2407 }
2408 }
2409 return false;
2410}
2411
2412/// Attempt to rotate an exiting block to the bottom of the loop.
2413///
2414/// Once we have built a chain, try to rotate it to line up the hot exit block
2415/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2416/// branches. For example, if the loop has fallthrough into its header and out
2417/// of its bottom already, don't rotate it.
2418void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2419 const MachineBasicBlock *ExitingBB,
2420 BlockFrequency ExitFreq,
2421 const BlockFilterSet &LoopBlockSet) {
2422 if (!ExitingBB)
2423 return;
2424
2425 MachineBasicBlock *Top = *LoopChain.begin();
2426 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2427
2428 // If ExitingBB is already the last one in a chain then nothing to do.
2429 if (Bottom == ExitingBB)
2430 return;
2431
2432 // The entry block should always be the first BB in a function.
2433 if (Top->isEntryBlock())
2434 return;
2435
2436 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2437
2438 // If the header has viable fallthrough, check whether the current loop
2439 // bottom is a viable exiting block. If so, bail out as rotating will
2440 // introduce an unnecessary branch.
2441 if (ViableTopFallthrough) {
2442 for (MachineBasicBlock *Succ : Bottom->successors()) {
2443 BlockChain *SuccChain = BlockToChain[Succ];
2444 if (!LoopBlockSet.count(Succ) &&
2445 (!SuccChain || Succ == *SuccChain->begin()))
2446 return;
2447 }
2448
2449 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2450 // frequency is larger than top fallthrough.
2451 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2452 if (FallThrough2Top >= ExitFreq)
2453 return;
2454 }
2455
2456 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2457 if (ExitIt == LoopChain.end())
2458 return;
2459
2460 // Rotating a loop exit to the bottom when there is a fallthrough to top
2461 // trades the entry fallthrough for an exit fallthrough.
2462 // If there is no bottom->top edge, but the chosen exit block does have
2463 // a fallthrough, we break that fallthrough for nothing in return.
2464
2465 // Let's consider an example. We have a built chain of basic blocks
2466 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2467 // By doing a rotation we get
2468 // Bk+1, ..., Bn, B1, ..., Bk
2469 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2470 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2471 // It might be compensated by fallthrough Bn -> B1.
2472 // So we have a condition to avoid creation of extra branch by loop rotation.
2473 // All below must be true to avoid loop rotation:
2474 // If there is a fallthrough to top (B1)
2475 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2476 // There is no fallthrough from bottom (Bn) to top (B1).
2477 // Please note that there is no exit fallthrough from Bn because we checked it
2478 // above.
2479 if (ViableTopFallthrough) {
2480 assert(std::next(ExitIt) != LoopChain.end() &&
2481 "Exit should not be last BB");
2482 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2483 if (ExitingBB->isSuccessor(NextBlockInChain))
2484 if (!Bottom->isSuccessor(Top))
2485 return;
2486 }
2487
2488 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2489 << " at bottom\n");
2490 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2491}
2492
2493/// Attempt to rotate a loop based on profile data to reduce branch cost.
2494///
2495/// With profile data, we can determine the cost in terms of missed fall through
2496/// opportunities when rotating a loop chain and select the best rotation.
2497/// Basically, there are three kinds of cost to consider for each rotation:
2498/// 1. The possibly missed fall through edge (if it exists) from BB out of
2499/// the loop to the loop header.
2500/// 2. The possibly missed fall through edges (if they exist) from the loop
2501/// exits to BB out of the loop.
2502/// 3. The missed fall through edge (if it exists) from the last BB to the
2503/// first BB in the loop chain.
2504/// Therefore, the cost for a given rotation is the sum of costs listed above.
2505/// We select the best rotation with the smallest cost.
2506void MachineBlockPlacement::rotateLoopWithProfile(
2507 BlockChain &LoopChain, const MachineLoop &L,
2508 const BlockFilterSet &LoopBlockSet) {
2509 auto RotationPos = LoopChain.end();
2510 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2511
2512 // The entry block should always be the first BB in a function.
2513 if (ChainHeaderBB->isEntryBlock())
2514 return;
2515
2516 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2517
2518 // A utility lambda that scales up a block frequency by dividing it by a
2519 // branch probability which is the reciprocal of the scale.
2520 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2521 unsigned Scale) -> BlockFrequency {
2522 if (Scale == 0)
2523 return BlockFrequency(0);
2524 // Use operator / between BlockFrequency and BranchProbability to implement
2525 // saturating multiplication.
2526 return Freq / BranchProbability(1, Scale);
2527 };
2528
2529 // Compute the cost of the missed fall-through edge to the loop header if the
2530 // chain head is not the loop header. As we only consider natural loops with
2531 // single header, this computation can be done only once.
2532 BlockFrequency HeaderFallThroughCost(0);
2533 for (auto *Pred : ChainHeaderBB->predecessors()) {
2534 BlockChain *PredChain = BlockToChain[Pred];
2535 if (!LoopBlockSet.count(Pred) &&
2536 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2537 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2538 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2539 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2540 // If the predecessor has only an unconditional jump to the header, we
2541 // need to consider the cost of this jump.
2542 if (Pred->succ_size() == 1)
2543 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2544 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2545 }
2546 }
2547
2548 // Here we collect all exit blocks in the loop, and for each exit we find out
2549 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2550 // as the sum of frequencies of exit edges we collect here, excluding the exit
2551 // edge from the tail of the loop chain.
2553 for (auto *BB : LoopChain) {
2554 auto LargestExitEdgeProb = BranchProbability::getZero();
2555 for (auto *Succ : BB->successors()) {
2556 BlockChain *SuccChain = BlockToChain[Succ];
2557 if (!LoopBlockSet.count(Succ) &&
2558 (!SuccChain || Succ == *SuccChain->begin())) {
2559 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2560 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2561 }
2562 }
2563 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2564 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2565 ExitsWithFreq.emplace_back(BB, ExitFreq);
2566 }
2567 }
2568
2569 // In this loop we iterate every block in the loop chain and calculate the
2570 // cost assuming the block is the head of the loop chain. When the loop ends,
2571 // we should have found the best candidate as the loop chain's head.
2572 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2573 EndIter = LoopChain.end();
2574 Iter != EndIter; Iter++, TailIter++) {
2575 // TailIter is used to track the tail of the loop chain if the block we are
2576 // checking (pointed by Iter) is the head of the chain.
2577 if (TailIter == LoopChain.end())
2578 TailIter = LoopChain.begin();
2579
2580 auto TailBB = *TailIter;
2581
2582 // Calculate the cost by putting this BB to the top.
2583 BlockFrequency Cost = BlockFrequency(0);
2584
2585 // If the current BB is the loop header, we need to take into account the
2586 // cost of the missed fall through edge from outside of the loop to the
2587 // header.
2588 if (Iter != LoopChain.begin())
2589 Cost += HeaderFallThroughCost;
2590
2591 // Collect the loop exit cost by summing up frequencies of all exit edges
2592 // except the one from the chain tail.
2593 for (auto &ExitWithFreq : ExitsWithFreq)
2594 if (TailBB != ExitWithFreq.first)
2595 Cost += ExitWithFreq.second;
2596
2597 // The cost of breaking the once fall-through edge from the tail to the top
2598 // of the loop chain. Here we need to consider three cases:
2599 // 1. If the tail node has only one successor, then we will get an
2600 // additional jmp instruction. So the cost here is (MisfetchCost +
2601 // JumpInstCost) * tail node frequency.
2602 // 2. If the tail node has two successors, then we may still get an
2603 // additional jmp instruction if the layout successor after the loop
2604 // chain is not its CFG successor. Note that the more frequently executed
2605 // jmp instruction will be put ahead of the other one. Assume the
2606 // frequency of those two branches are x and y, where x is the frequency
2607 // of the edge to the chain head, then the cost will be
2608 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2609 // 3. If the tail node has more than two successors (this rarely happens),
2610 // we won't consider any additional cost.
2611 if (TailBB->isSuccessor(*Iter)) {
2612 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2613 if (TailBB->succ_size() == 1)
2614 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2615 else if (TailBB->succ_size() == 2) {
2616 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2617 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2618 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2619 ? TailBBFreq * TailToHeadProb.getCompl()
2620 : TailToHeadFreq;
2621 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2622 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2623 }
2624 }
2625
2626 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2627 << getBlockName(*Iter) << " to the top: "
2628 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2629
2630 if (Cost < SmallestRotationCost) {
2631 SmallestRotationCost = Cost;
2632 RotationPos = Iter;
2633 }
2634 }
2635
2636 if (RotationPos != LoopChain.end()) {
2637 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2638 << " to the top\n");
2639 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2640 }
2641}
2642
2643/// Collect blocks in the given loop that are to be placed.
2644///
2645/// When profile data is available, exclude cold blocks from the returned set;
2646/// otherwise, collect all blocks in the loop.
2647MachineBlockPlacement::BlockFilterSet
2648MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2649 // Collect the blocks in a set ordered by block number, as this gives the same
2650 // order as they appear in the function.
2651 struct MBBCompare {
2652 bool operator()(const MachineBasicBlock *X,
2653 const MachineBasicBlock *Y) const {
2654 return X->getNumber() < Y->getNumber();
2655 }
2656 };
2657 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2658
2659 // Filter cold blocks off from LoopBlockSet when profile data is available.
2660 // Collect the sum of frequencies of incoming edges to the loop header from
2661 // outside. If we treat the loop as a super block, this is the frequency of
2662 // the loop. Then for each block in the loop, we calculate the ratio between
2663 // its frequency and the frequency of the loop block. When it is too small,
2664 // don't add it to the loop chain. If there are outer loops, then this block
2665 // will be merged into the first outer loop chain for which this block is not
2666 // cold anymore. This needs precise profile data and we only do this when
2667 // profile data is available.
2668 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2669 BlockFrequency LoopFreq(0);
2670 for (auto *LoopPred : L.getHeader()->predecessors())
2671 if (!L.contains(LoopPred))
2672 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2673 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2674
2675 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2676 if (LoopBlockSet.count(LoopBB))
2677 continue;
2678 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2679 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2680 continue;
2681 BlockChain *Chain = BlockToChain[LoopBB];
2682 for (MachineBasicBlock *ChainBB : *Chain)
2683 LoopBlockSet.insert(ChainBB);
2684 }
2685 } else
2686 LoopBlockSet.insert(L.block_begin(), L.block_end());
2687
2688 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2689 // std::set. We will only remove blocks and never insert them, which will
2690 // preserve the ordering.
2691 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2692 return Ret;
2693}
2694
2695/// Forms basic block chains from the natural loop structures.
2696///
2697/// These chains are designed to preserve the existing *structure* of the code
2698/// as much as possible. We can then stitch the chains together in a way which
2699/// both preserves the topological structure and minimizes taken conditional
2700/// branches.
2701void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2702 // First recurse through any nested loops, building chains for those inner
2703 // loops.
2704 for (const MachineLoop *InnerLoop : L)
2705 buildLoopChains(*InnerLoop);
2706
2707 assert(BlockWorkList.empty() &&
2708 "BlockWorkList not empty when starting to build loop chains.");
2709 assert(EHPadWorkList.empty() &&
2710 "EHPadWorkList not empty when starting to build loop chains.");
2711 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2712
2713 // Check if we have profile data for this function. If yes, we will rotate
2714 // this loop by modeling costs more precisely which requires the profile data
2715 // for better layout.
2716 bool RotateLoopWithProfile =
2718 (PreciseRotationCost && F->getFunction().hasProfileData());
2719
2720 // First check to see if there is an obviously preferable top block for the
2721 // loop. This will default to the header, but may end up as one of the
2722 // predecessors to the header if there is one which will result in strictly
2723 // fewer branches in the loop body.
2724 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2725
2726 // If we selected just the header for the loop top, look for a potentially
2727 // profitable exit block in the event that rotating the loop can eliminate
2728 // branches by placing an exit edge at the bottom.
2729 //
2730 // Loops are processed innermost to uttermost, make sure we clear
2731 // PreferredLoopExit before processing a new loop.
2732 PreferredLoopExit = nullptr;
2733 BlockFrequency ExitFreq;
2734 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2735 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2736
2737 BlockChain &LoopChain = *BlockToChain[LoopTop];
2738
2739 // FIXME: This is a really lame way of walking the chains in the loop: we
2740 // walk the blocks, and use a set to prevent visiting a particular chain
2741 // twice.
2742 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2743 assert(LoopChain.UnscheduledPredecessors == 0 &&
2744 "LoopChain should not have unscheduled predecessors.");
2745 UpdatedPreds.insert(&LoopChain);
2746
2747 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2748 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2749
2750 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2751
2752 if (RotateLoopWithProfile)
2753 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2754 else
2755 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2756
2757 LLVM_DEBUG({
2758 // Crash at the end so we get all of the debugging output first.
2759 bool BadLoop = false;
2760 if (LoopChain.UnscheduledPredecessors) {
2761 BadLoop = true;
2762 dbgs() << "Loop chain contains a block without its preds placed!\n"
2763 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2764 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2765 }
2766 for (MachineBasicBlock *ChainBB : LoopChain) {
2767 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2768 if (!LoopBlockSet.remove(ChainBB)) {
2769 // We don't mark the loop as bad here because there are real situations
2770 // where this can occur. For example, with an unanalyzable fallthrough
2771 // from a loop block to a non-loop block or vice versa.
2772 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2773 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2774 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2775 << " Bad block: " << getBlockName(ChainBB) << "\n";
2776 }
2777 }
2778
2779 if (!LoopBlockSet.empty()) {
2780 BadLoop = true;
2781 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2782 dbgs() << "Loop contains blocks never placed into a chain!\n"
2783 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2784 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2785 << " Bad block: " << getBlockName(LoopBB) << "\n";
2786 }
2787 assert(!BadLoop && "Detected problems with the placement of this loop.");
2788 });
2789
2790 BlockWorkList.clear();
2791 EHPadWorkList.clear();
2792}
2793
2794void MachineBlockPlacement::buildCFGChains() {
2795 // Ensure that every BB in the function has an associated chain to simplify
2796 // the assumptions of the remaining algorithm.
2797 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2798 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2799 ++FI) {
2800 MachineBasicBlock *BB = &*FI;
2801 BlockChain *Chain =
2802 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2803 // Also, merge any blocks which we cannot reason about and must preserve
2804 // the exact fallthrough behavior for.
2805 while (true) {
2806 Cond.clear();
2807 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2808 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2809 break;
2810
2811 MachineFunction::iterator NextFI = std::next(FI);
2812 MachineBasicBlock *NextBB = &*NextFI;
2813 // Ensure that the layout successor is a viable block, as we know that
2814 // fallthrough is a possibility.
2815 assert(NextFI != FE && "Can't fallthrough past the last block.");
2816 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2817 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2818 << "\n");
2819 Chain->merge(NextBB, nullptr);
2820#ifndef NDEBUG
2821 BlocksWithUnanalyzableExits.insert(&*BB);
2822#endif
2823 FI = NextFI;
2824 BB = NextBB;
2825 }
2826 }
2827
2828 // Build any loop-based chains.
2829 PreferredLoopExit = nullptr;
2830 for (MachineLoop *L : *MLI)
2831 buildLoopChains(*L);
2832
2833 assert(BlockWorkList.empty() &&
2834 "BlockWorkList should be empty before building final chain.");
2835 assert(EHPadWorkList.empty() &&
2836 "EHPadWorkList should be empty before building final chain.");
2837
2838 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2839 for (MachineBasicBlock &MBB : *F)
2840 fillWorkLists(&MBB, UpdatedPreds);
2841
2842 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2843 buildChain(&F->front(), FunctionChain);
2844
2845#ifndef NDEBUG
2846 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2847#endif
2848 LLVM_DEBUG({
2849 // Crash at the end so we get all of the debugging output first.
2850 bool BadFunc = false;
2851 FunctionBlockSetType FunctionBlockSet;
2852 for (MachineBasicBlock &MBB : *F)
2853 FunctionBlockSet.insert(&MBB);
2854
2855 for (MachineBasicBlock *ChainBB : FunctionChain)
2856 if (!FunctionBlockSet.erase(ChainBB)) {
2857 BadFunc = true;
2858 dbgs() << "Function chain contains a block not in the function!\n"
2859 << " Bad block: " << getBlockName(ChainBB) << "\n";
2860 }
2861
2862 if (!FunctionBlockSet.empty()) {
2863 BadFunc = true;
2864 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2865 dbgs() << "Function contains blocks never placed into a chain!\n"
2866 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2867 }
2868 assert(!BadFunc && "Detected problems with the block placement.");
2869 });
2870
2871 // Remember original layout ordering, so we can update terminators after
2872 // reordering to point to the original layout successor.
2873 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2874 F->getNumBlockIDs());
2875 {
2876 MachineBasicBlock *LastMBB = nullptr;
2877 for (auto &MBB : *F) {
2878 if (LastMBB != nullptr)
2879 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2880 LastMBB = &MBB;
2881 }
2882 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2883 }
2884
2885 // Splice the blocks into place.
2886 MachineFunction::iterator InsertPos = F->begin();
2887 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2888 for (MachineBasicBlock *ChainBB : FunctionChain) {
2889 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2890 : " ... ")
2891 << getBlockName(ChainBB) << "\n");
2892 if (InsertPos != MachineFunction::iterator(ChainBB))
2893 F->splice(InsertPos, ChainBB);
2894 else
2895 ++InsertPos;
2896
2897 // Update the terminator of the previous block.
2898 if (ChainBB == *FunctionChain.begin())
2899 continue;
2900 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2901
2902 // FIXME: It would be awesome of updateTerminator would just return rather
2903 // than assert when the branch cannot be analyzed in order to remove this
2904 // boiler plate.
2905 Cond.clear();
2906 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2907
2908#ifndef NDEBUG
2909 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2910 // Given the exact block placement we chose, we may actually not _need_ to
2911 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2912 // do that at this point is a bug.
2913 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2914 !PrevBB->canFallThrough()) &&
2915 "Unexpected block with un-analyzable fallthrough!");
2916 Cond.clear();
2917 TBB = FBB = nullptr;
2918 }
2919#endif
2920
2921 // The "PrevBB" is not yet updated to reflect current code layout, so,
2922 // o. it may fall-through to a block without explicit "goto" instruction
2923 // before layout, and no longer fall-through it after layout; or
2924 // o. just opposite.
2925 //
2926 // analyzeBranch() may return erroneous value for FBB when these two
2927 // situations take place. For the first scenario FBB is mistakenly set NULL;
2928 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2929 // mistakenly pointing to "*BI".
2930 // Thus, if the future change needs to use FBB before the layout is set, it
2931 // has to correct FBB first by using the code similar to the following:
2932 //
2933 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2934 // PrevBB->updateTerminator();
2935 // Cond.clear();
2936 // TBB = FBB = nullptr;
2937 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2938 // // FIXME: This should never take place.
2939 // TBB = FBB = nullptr;
2940 // }
2941 // }
2942 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2943 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2944 }
2945 }
2946
2947 // Fixup the last block.
2948 Cond.clear();
2949 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2950 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2951 MachineBasicBlock *PrevBB = &F->back();
2952 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2953 }
2954
2955 BlockWorkList.clear();
2956 EHPadWorkList.clear();
2957}
2958
2959void MachineBlockPlacement::optimizeBranches() {
2960 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2962
2963 // Now that all the basic blocks in the chain have the proper layout,
2964 // make a final call to analyzeBranch with AllowModify set.
2965 // Indeed, the target may be able to optimize the branches in a way we
2966 // cannot because all branches may not be analyzable.
2967 // E.g., the target may be able to remove an unconditional branch to
2968 // a fallthrough when it occurs after predicated terminators.
2969 for (MachineBasicBlock *ChainBB : FunctionChain) {
2970 Cond.clear();
2971 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2972 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
2973 continue;
2974 if (!TBB || !FBB || Cond.empty())
2975 continue;
2976 // If we are optimizing for size we do not consider the runtime performance.
2977 // Instead, we retain the original branch condition so we have more uniform
2978 // instructions which will benefit ICF.
2979 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
2980 continue;
2981 // If ChainBB has a two-way branch, try to re-order the branches
2982 // such that we branch to the successor with higher probability first.
2983 if (MBPI->getEdgeProbability(ChainBB, TBB) >=
2984 MBPI->getEdgeProbability(ChainBB, FBB))
2985 continue;
2987 continue;
2988 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2989 << getBlockName(ChainBB) << "\n");
2990 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2991 << "\n");
2992 auto Dl = ChainBB->findBranchDebugLoc();
2993 TII->removeBranch(*ChainBB);
2994 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
2995 }
2996}
2997
2998void MachineBlockPlacement::alignBlocks() {
2999 // Walk through the backedges of the function now that we have fully laid out
3000 // the basic blocks and align the destination of each backedge. We don't rely
3001 // exclusively on the loop info here so that we can align backedges in
3002 // unnatural CFGs and backedges that were introduced purely because of the
3003 // loop rotations done during this layout pass.
3005 if (F->getFunction().hasMinSize() ||
3006 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3007 return;
3008 }
3009
3010 BlockChain &FunctionChain = *BlockToChain[&F->front()];
3011 // Empty chain.
3012 if (FunctionChain.begin() == FunctionChain.end())
3013 return;
3014
3015 const BranchProbability ColdProb(1, 5); // 20%
3016 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
3017 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3018 for (MachineBasicBlock *ChainBB : FunctionChain) {
3019 if (ChainBB == *FunctionChain.begin())
3020 continue;
3021
3022 // Don't align non-looping basic blocks. These are unlikely to execute
3023 // enough times to matter in practice. Note that we'll still handle
3024 // unnatural CFGs inside of a natural outer loop (the common case) and
3025 // rotated loops.
3026 MachineLoop *L = MLI->getLoopFor(ChainBB);
3027 if (!L)
3028 continue;
3029
3030 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
3031 unsigned MDAlign = 1;
3032 MDNode *LoopID = L->getLoopID();
3033 if (LoopID) {
3034 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
3035 MDNode *MD = dyn_cast<MDNode>(MDO);
3036 if (MD == nullptr)
3037 continue;
3038 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
3039 if (S == nullptr)
3040 continue;
3041 if (S->getString() == "llvm.loop.align") {
3042 assert(MD->getNumOperands() == 2 &&
3043 "per-loop align metadata should have two operands.");
3044 MDAlign =
3045 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
3046 assert(MDAlign >= 1 && "per-loop align value must be positive.");
3047 }
3048 }
3049 }
3050
3051 // Use max of the TLIAlign and MDAlign
3052 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
3053 if (LoopAlign == 1)
3054 continue; // Don't care about loop alignment.
3055
3056 // If the block is cold relative to the function entry don't waste space
3057 // aligning it.
3058 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3059 if (Freq < WeightedEntryFreq)
3060 continue;
3061
3062 // If the block is cold relative to its loop header, don't align it
3063 // regardless of what edges into the block exist.
3064 MachineBasicBlock *LoopHeader = L->getHeader();
3065 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3066 if (Freq < (LoopHeaderFreq * ColdProb))
3067 continue;
3068
3069 // If the global profiles indicates so, don't align it.
3070 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3071 !TLI->alignLoopsWithOptSize())
3072 continue;
3073
3074 // Check for the existence of a non-layout predecessor which would benefit
3075 // from aligning this block.
3076 MachineBasicBlock *LayoutPred =
3077 &*std::prev(MachineFunction::iterator(ChainBB));
3078
3079 auto DetermineMaxAlignmentPadding = [&]() {
3080 // Set the maximum bytes allowed to be emitted for alignment.
3081 unsigned MaxBytes;
3082 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3084 else
3085 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3086 ChainBB->setMaxBytesForAlignment(MaxBytes);
3087 };
3088
3089 // Force alignment if all the predecessors are jumps. We already checked
3090 // that the block isn't cold above.
3091 if (!LayoutPred->isSuccessor(ChainBB)) {
3092 ChainBB->setAlignment(LoopAlign);
3093 DetermineMaxAlignmentPadding();
3094 continue;
3095 }
3096
3097 // Align this block if the layout predecessor's edge into this block is
3098 // cold relative to the block. When this is true, other predecessors make up
3099 // all of the hot entries into the block and thus alignment is likely to be
3100 // important.
3101 BranchProbability LayoutProb =
3102 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3103 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3104 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3105 ChainBB->setAlignment(LoopAlign);
3106 DetermineMaxAlignmentPadding();
3107 }
3108 }
3109
3110 const bool HasMaxBytesOverride =
3111 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3112
3113 if (AlignAllBlock)
3114 // Align all of the blocks in the function to a specific alignment.
3115 for (MachineBasicBlock &MBB : *F) {
3116 if (HasMaxBytesOverride)
3119 else
3121 }
3122 else if (AlignAllNonFallThruBlocks) {
3123 // Align all of the blocks that have no fall-through predecessors to a
3124 // specific alignment.
3125 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3126 auto LayoutPred = std::prev(MBI);
3127 if (!LayoutPred->isSuccessor(&*MBI)) {
3128 if (HasMaxBytesOverride)
3129 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3131 else
3132 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3133 }
3134 }
3135 }
3136}
3137
3138/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3139/// it was duplicated into its chain predecessor and removed.
3140/// \p BB - Basic block that may be duplicated.
3141///
3142/// \p LPred - Chosen layout predecessor of \p BB.
3143/// Updated to be the chain end if LPred is removed.
3144/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3145/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3146/// Used to identify which blocks to update predecessor
3147/// counts.
3148/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3149/// chosen in the given order due to unnatural CFG
3150/// only needed if \p BB is removed and
3151/// \p PrevUnplacedBlockIt pointed to \p BB.
3152/// @return true if \p BB was removed.
3153bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3154 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
3155 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3156 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3157 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3158 bool Removed, DuplicatedToLPred;
3159 bool DuplicatedToOriginalLPred;
3160 Removed = maybeTailDuplicateBlock(
3161 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3162 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3163 if (!Removed)
3164 return false;
3165 DuplicatedToOriginalLPred = DuplicatedToLPred;
3166 // Iteratively try to duplicate again. It can happen that a block that is
3167 // duplicated into is still small enough to be duplicated again.
3168 // No need to call markBlockSuccessors in this case, as the blocks being
3169 // duplicated from here on are already scheduled.
3170 while (DuplicatedToLPred && Removed) {
3171 MachineBasicBlock *DupBB, *DupPred;
3172 // The removal callback causes Chain.end() to be updated when a block is
3173 // removed. On the first pass through the loop, the chain end should be the
3174 // same as it was on function entry. On subsequent passes, because we are
3175 // duplicating the block at the end of the chain, if it is removed the
3176 // chain will have shrunk by one block.
3177 BlockChain::iterator ChainEnd = Chain.end();
3178 DupBB = *(--ChainEnd);
3179 // Now try to duplicate again.
3180 if (ChainEnd == Chain.begin())
3181 break;
3182 DupPred = *std::prev(ChainEnd);
3183 Removed = maybeTailDuplicateBlock(
3184 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3185 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3186 }
3187 // If BB was duplicated into LPred, it is now scheduled. But because it was
3188 // removed, markChainSuccessors won't be called for its chain. Instead we
3189 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3190 // at the end because repeating the tail duplication can increase the number
3191 // of unscheduled predecessors.
3192 LPred = *std::prev(Chain.end());
3193 if (DuplicatedToOriginalLPred)
3194 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3195 return true;
3196}
3197
3198/// Tail duplicate \p BB into (some) predecessors if profitable.
3199/// \p BB - Basic block that may be duplicated
3200/// \p LPred - Chosen layout predecessor of \p BB
3201/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3202/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3203/// Used to identify which blocks to update predecessor
3204/// counts.
3205/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3206/// chosen in the given order due to unnatural CFG
3207/// only needed if \p BB is removed and
3208/// \p PrevUnplacedBlockIt pointed to \p BB.
3209/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3210/// \return - True if the block was duplicated into all preds and removed.
3211bool MachineBlockPlacement::maybeTailDuplicateBlock(
3212 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3213 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3214 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3215 bool &DuplicatedToLPred) {
3216 DuplicatedToLPred = false;
3217 if (!shouldTailDuplicate(BB))
3218 return false;
3219
3220 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3221 << "\n");
3222
3223 // This has to be a callback because none of it can be done after
3224 // BB is deleted.
3225 bool Removed = false;
3226 auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3227 // Signal to outer function
3228 Removed = true;
3229
3230 // Remove from the Chain and Chain Map
3231 if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) {
3232 It->second->remove(RemBB);
3233 BlockToChain.erase(It);
3234 }
3235
3236 // Handle the unplaced block iterator
3237 if (&(*PrevUnplacedBlockIt) == RemBB) {
3238 PrevUnplacedBlockIt++;
3239 }
3240
3241 // Handle the Work Lists
3242 if (RemBB->isEHPad()) {
3243 llvm::erase(EHPadWorkList, RemBB);
3244 } else {
3245 llvm::erase(BlockWorkList, RemBB);
3246 }
3247
3248 // Handle the filter set
3249 if (BlockFilter) {
3250 auto It = llvm::find(*BlockFilter, RemBB);
3251 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3252 // pointing to the same element as before.
3253 if (It != BlockFilter->end()) {
3254 if (It < PrevUnplacedBlockInFilterIt) {
3255 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3256 // BlockFilter is a SmallVector so all elements after RemBB are
3257 // shifted to the front by 1 after its deletion.
3258 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3259 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3260 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3261 (void)PrevBB;
3262 } else if (It == PrevUnplacedBlockInFilterIt)
3263 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3264 // have to set it to the next element.
3265 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3266 else
3267 BlockFilter->erase(It);
3268 }
3269 }
3270
3271 // Remove the block from loop info.
3272 MLI->removeBlock(RemBB);
3273 if (RemBB == PreferredLoopExit)
3274 PreferredLoopExit = nullptr;
3275
3276 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3277 << "\n");
3278 };
3279 auto RemovalCallbackRef =
3280 function_ref<void(MachineBasicBlock *)>(RemovalCallback);
3281
3282 SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
3283 bool IsSimple = TailDup.isSimpleBB(BB);
3284 SmallVector<MachineBasicBlock *, 8> CandidatePreds;
3285 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3286 if (F->getFunction().hasProfileData()) {
3287 // We can do partial duplication with precise profile information.
3288 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3289 if (CandidatePreds.size() == 0)
3290 return false;
3291 if (CandidatePreds.size() < BB->pred_size())
3292 CandidatePtr = &CandidatePreds;
3293 }
3294 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3295 &RemovalCallbackRef, CandidatePtr);
3296
3297 // Update UnscheduledPredecessors to reflect tail-duplication.
3298 DuplicatedToLPred = false;
3299 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3300 // We're only looking for unscheduled predecessors that match the filter.
3301 BlockChain *PredChain = BlockToChain[Pred];
3302 if (Pred == LPred)
3303 DuplicatedToLPred = true;
3304 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
3305 PredChain == &Chain)
3306 continue;
3307 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3308 if (BlockFilter && !BlockFilter->count(NewSucc))
3309 continue;
3310 BlockChain *NewChain = BlockToChain[NewSucc];
3311 if (NewChain != &Chain && NewChain != PredChain)
3312 NewChain->UnscheduledPredecessors++;
3313 }
3314 }
3315 return Removed;
3316}
3317
3318// Count the number of actual machine instructions.
3320 uint64_t InstrCount = 0;
3321 for (MachineInstr &MI : *MBB) {
3322 if (!MI.isPHI() && !MI.isMetaInstruction())
3323 InstrCount += 1;
3324 }
3325 return InstrCount;
3326}
3327
3328// The size cost of duplication is the instruction size of the duplicated block.
3329// So we should scale the threshold accordingly. But the instruction size is not
3330// available on all targets, so we use the number of instructions instead.
3331BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3332 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3333}
3334
3335// Returns true if BB is Pred's best successor.
3336bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3337 MachineBasicBlock *Pred,
3338 BlockFilterSet *BlockFilter) {
3339 if (BB == Pred)
3340 return false;
3341 if (BlockFilter && !BlockFilter->count(Pred))
3342 return false;
3343 BlockChain *PredChain = BlockToChain[Pred];
3344 if (PredChain && (Pred != *std::prev(PredChain->end())))
3345 return false;
3346
3347 // Find the successor with largest probability excluding BB.
3348 BranchProbability BestProb = BranchProbability::getZero();
3349 for (MachineBasicBlock *Succ : Pred->successors())
3350 if (Succ != BB) {
3351 if (BlockFilter && !BlockFilter->count(Succ))
3352 continue;
3353 BlockChain *SuccChain = BlockToChain[Succ];
3354 if (SuccChain && (Succ != *SuccChain->begin()))
3355 continue;
3356 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3357 if (SuccProb > BestProb)
3358 BestProb = SuccProb;
3359 }
3360
3361 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3362 if (BBProb <= BestProb)
3363 return false;
3364
3365 // Compute the number of reduced taken branches if Pred falls through to BB
3366 // instead of another successor. Then compare it with threshold.
3367 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3368 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3369 return Gain > scaleThreshold(BB);
3370}
3371
3372// Find out the predecessors of BB and BB can be beneficially duplicated into
3373// them.
3374void MachineBlockPlacement::findDuplicateCandidates(
3375 SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB,
3376 BlockFilterSet *BlockFilter) {
3377 MachineBasicBlock *Fallthrough = nullptr;
3378 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3379 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3380 SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors());
3381 SmallVector<MachineBasicBlock *, 8> Succs(BB->successors());
3382
3383 // Sort for highest frequency.
3384 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3385 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3386 };
3387 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3388 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3389 };
3390 llvm::stable_sort(Succs, CmpSucc);
3391 llvm::stable_sort(Preds, CmpPred);
3392
3393 auto SuccIt = Succs.begin();
3394 if (SuccIt != Succs.end()) {
3395 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3396 }
3397
3398 // For each predecessors of BB, compute the benefit of duplicating BB,
3399 // if it is larger than the threshold, add it into Candidates.
3400 //
3401 // If we have following control flow.
3402 //
3403 // PB1 PB2 PB3 PB4
3404 // \ | / /\
3405 // \ | / / \
3406 // \ |/ / \
3407 // BB----/ OB
3408 // /\
3409 // / \
3410 // SB1 SB2
3411 //
3412 // And it can be partially duplicated as
3413 //
3414 // PB2+BB
3415 // | PB1 PB3 PB4
3416 // | | / /\
3417 // | | / / \
3418 // | |/ / \
3419 // | BB----/ OB
3420 // |\ /|
3421 // | X |
3422 // |/ \|
3423 // SB2 SB1
3424 //
3425 // The benefit of duplicating into a predecessor is defined as
3426 // Orig_taken_branch - Duplicated_taken_branch
3427 //
3428 // The Orig_taken_branch is computed with the assumption that predecessor
3429 // jumps to BB and the most possible successor is laid out after BB.
3430 //
3431 // The Duplicated_taken_branch is computed with the assumption that BB is
3432 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3433 // SB2 for PB2 in our case). If there is no available successor, the combined
3434 // block jumps to all BB's successor, like PB3 in this example.
3435 //
3436 // If a predecessor has multiple successors, so BB can't be duplicated into
3437 // it. But it can beneficially fall through to BB, and duplicate BB into other
3438 // predecessors.
3439 for (MachineBasicBlock *Pred : Preds) {
3440 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3441
3442 if (!TailDup.canTailDuplicate(BB, Pred)) {
3443 // BB can't be duplicated into Pred, but it is possible to be layout
3444 // below Pred.
3445 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3446 Fallthrough = Pred;
3447 if (SuccIt != Succs.end())
3448 SuccIt++;
3449 }
3450 continue;
3451 }
3452
3453 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3454 BlockFrequency DupCost;
3455 if (SuccIt == Succs.end()) {
3456 // Jump to all successors;
3457 if (Succs.size() > 0)
3458 DupCost += PredFreq;
3459 } else {
3460 // Fallthrough to *SuccIt, jump to all other successors;
3461 DupCost += PredFreq;
3462 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3463 }
3464
3465 assert(OrigCost >= DupCost);
3466 OrigCost -= DupCost;
3467 if (OrigCost > BBDupThreshold) {
3468 Candidates.push_back(Pred);
3469 if (SuccIt != Succs.end())
3470 SuccIt++;
3471 }
3472 }
3473
3474 // No predecessors can optimally fallthrough to BB.
3475 // So we can change one duplication into fallthrough.
3476 if (!Fallthrough) {
3477 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3478 Candidates[0] = Candidates.back();
3479 Candidates.pop_back();
3480 }
3481 }
3482}
3483
3484void MachineBlockPlacement::initTailDupThreshold() {
3485 DupThreshold = BlockFrequency(0);
3486 if (F->getFunction().hasProfileData()) {
3487 // We prefer to use prifile count.
3488 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3489 if (HotThreshold != UINT64_MAX) {
3490 UseProfileCount = true;
3491 DupThreshold =
3492 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3493 } else {
3494 // Profile count is not available, we can use block frequency instead.
3495 BlockFrequency MaxFreq = BlockFrequency(0);
3496 for (MachineBasicBlock &MBB : *F) {
3497 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3498 if (Freq > MaxFreq)
3499 MaxFreq = Freq;
3500 }
3501
3502 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3503 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3504 UseProfileCount = false;
3505 }
3506 }
3507
3508 TailDupSize = TailDupPlacementThreshold;
3509 // If only the aggressive threshold is explicitly set, use it.
3510 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3511 TailDupPlacementThreshold.getNumOccurrences() == 0)
3513
3514 // For aggressive optimization, we can adjust some thresholds to be less
3515 // conservative.
3516 if (OptLevel >= CodeGenOptLevel::Aggressive) {
3517 // At O3 we should be more willing to copy blocks for tail duplication. This
3518 // increases size pressure, so we only do it at O3
3519 // Do this unless only the regular threshold is explicitly set.
3520 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3521 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3523 }
3524
3525 // If there's no threshold provided through options, query the target
3526 // information for a threshold instead.
3527 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3528 (OptLevel < CodeGenOptLevel::Aggressive ||
3529 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3530 TailDupSize = TII->getTailDuplicateSize(OptLevel);
3531}
3532
3533PreservedAnalyses
3536 auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3537 auto MBFI = std::make_unique<MBFIWrapper>(
3539 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3540 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3542 : nullptr;
3544 .getCachedResult<ProfileSummaryAnalysis>(
3545 *MF.getFunction().getParent());
3546 if (!PSI)
3547 report_fatal_error("MachineBlockPlacement requires ProfileSummaryAnalysis",
3548 false);
3549
3550 MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3551 AllowTailMerge);
3552
3553 if (!MBP.run(MF))
3554 return PreservedAnalyses::all();
3555
3557}
3558
3560 raw_ostream &OS,
3561 function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3562 OS << MapClassName2PassName(name());
3563 if (!AllowTailMerge)
3564 OS << "<no-tail-merge>";
3565}
3566
3567bool MachineBlockPlacement::run(MachineFunction &MF) {
3568
3569 // Check for single-block functions and skip them.
3570 if (std::next(MF.begin()) == MF.end())
3571 return false;
3572
3573 F = &MF;
3574 OptLevel = F->getTarget().getOptLevel();
3575
3576 TII = MF.getSubtarget().getInstrInfo();
3577 TLI = MF.getSubtarget().getTargetLowering();
3578
3579 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3580 // there are no MachineLoops.
3581 PreferredLoopExit = nullptr;
3582
3583 assert(BlockToChain.empty() &&
3584 "BlockToChain map should be empty before starting placement.");
3585 assert(ComputedEdges.empty() &&
3586 "Computed Edge map should be empty before starting placement.");
3587
3588 // Initialize tail duplication thresholds.
3589 initTailDupThreshold();
3590
3591 const bool OptForSize =
3592 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3593 // Determine whether to use ext-tsp for perf/size optimization. The method
3594 // is beneficial only for instances with at least 3 basic blocks and it can be
3595 // disabled for huge functions (exceeding a certain size).
3596 bool UseExtTspForPerf = false;
3597 bool UseExtTspForSize = false;
3598 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3599 UseExtTspForPerf =
3602 UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3603 }
3604
3605 // Apply tail duplication.
3606 if (allowTailDupPlacement(*F)) {
3607 if (OptForSize)
3608 TailDupSize = 1;
3609 const bool PreRegAlloc = false;
3610 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3611 /* LayoutMode */ true, TailDupSize);
3612 if (!UseExtTspForSize)
3613 precomputeTriangleChains();
3614 }
3615
3616 // Run the main block placement.
3617 if (!UseExtTspForSize)
3618 buildCFGChains();
3619
3620 // Changing the layout can create new tail merging opportunities.
3621 // TailMerge can create jump into if branches that make CFG irreducible for
3622 // HW that requires structured CFG.
3623 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3624 AllowTailMerge && BranchFoldPlacement &&
3625 MF.size() > 3;
3626 // No tail merging opportunities if the block number is less than four.
3627 if (EnableTailMerge) {
3628 const unsigned TailMergeSize = TailDupSize + 1;
3629 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3630 *MBFI, *MBPI, PSI, TailMergeSize);
3631
3632 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3633 /*AfterPlacement=*/true)) {
3634 // Must redo the post-dominator tree if blocks were changed.
3635 if (MPDT)
3636 MPDT->recalculate(MF);
3637 if (!UseExtTspForSize) {
3638 // Redo the layout if tail merging creates/removes/moves blocks.
3639 BlockToChain.clear();
3640 ComputedEdges.clear();
3641 ChainAllocator.DestroyAll();
3642 buildCFGChains();
3643 }
3644 }
3645 }
3646
3647 // Apply a post-processing optimizing block placement:
3648 // - find a new placement and modify the layout of the blocks in the function;
3649 // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3650 if (UseExtTspForPerf || UseExtTspForSize) {
3651 assert(
3652 !(UseExtTspForPerf && UseExtTspForSize) &&
3653 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3654 applyExtTsp(/*OptForSize=*/UseExtTspForSize);
3655 createCFGChainExtTsp();
3656 }
3657
3658 optimizeBranches();
3659 alignBlocks();
3660
3661 BlockToChain.clear();
3662 ComputedEdges.clear();
3663 ChainAllocator.DestroyAll();
3664
3665 // View the function.
3667 (ViewBlockFreqFuncName.empty() ||
3668 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3670 MF.RenumberBlocks();
3671 MBFI->view("MBP." + MF.getName(), false);
3672 }
3673
3674 // We always return true as we have no way to track whether the final order
3675 // differs from the original order.
3676 return true;
3677}
3678
3679void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3680 // Prepare data; blocks are indexed by their index in the current ordering.
3681 DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
3682 BlockIndex.reserve(F->size());
3683 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3684 CurrentBlockOrder.reserve(F->size());
3685 size_t NumBlocks = 0;
3686 for (const MachineBasicBlock &MBB : *F) {
3687 BlockIndex[&MBB] = NumBlocks++;
3688 CurrentBlockOrder.push_back(&MBB);
3689 }
3690
3691 SmallVector<uint64_t, 0> BlockCounts(F->size());
3692 SmallVector<uint64_t, 0> BlockSizes(F->size());
3694 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3696 for (MachineBasicBlock &MBB : *F) {
3697 // Getting the block frequency.
3698 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3699 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
3700 // Getting the block size:
3701 // - approximate the size of an instruction by 4 bytes, and
3702 // - ignore debug instructions.
3703 // Note: getting the exact size of each block is target-dependent and can be
3704 // done by extending the interface of MCCodeEmitter. Experimentally we do
3705 // not see a perf improvement with the exact block sizes.
3706 auto NonDbgInsts =
3708 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3709 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3710
3711 // Getting jump frequencies.
3712 if (OptForSize) {
3713 Cond.clear();
3714 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3715 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3716 continue;
3717
3718 const MachineBasicBlock *FTB = MBB.getFallThrough();
3719 // Succs is a collection of distinct destinations of the block reachable
3720 // from MBB via a jump instruction; initialize the list using the three
3721 // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3722 Succs.clear();
3723 if (TBB && TBB != FTB)
3724 Succs.push_back(TBB);
3725 if (FBB && FBB != FTB)
3726 Succs.push_back(FBB);
3727 if (FTB)
3728 Succs.push_back(FTB);
3729 // Absolute magnitude of non-zero counts does not matter for the
3730 // optimization; prioritize slightly jumps with a single successor, since
3731 // the corresponding jump instruction will be removed from the binary.
3732 const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3733 for (const MachineBasicBlock *Succ : Succs)
3734 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3735 } else {
3736 for (MachineBasicBlock *Succ : MBB.successors()) {
3737 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3738 BlockFrequency JumpFreq = BlockFreq * EP;
3739 JumpCounts.push_back(
3740 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3741 }
3742 }
3743 }
3744
3745 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3746 << " with profile = " << F->getFunction().hasProfileData()
3747 << " (" << F->getName() << ")" << "\n");
3748
3749 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3750 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3751
3752 // Run the layout algorithm.
3753 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3754 std::vector<const MachineBasicBlock *> NewBlockOrder;
3755 NewBlockOrder.reserve(F->size());
3756 for (uint64_t Node : NewOrder) {
3757 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3758 }
3759 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3760 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3761
3762 // If the optimization is unsuccessful, fall back to the original block order.
3763 if (OptForSize && OrgScore > OptScore)
3764 assignBlockOrder(CurrentBlockOrder);
3765 else
3766 assignBlockOrder(NewBlockOrder);
3767}
3768
3769void MachineBlockPlacement::assignBlockOrder(
3770 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3771 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3772 F->RenumberBlocks();
3773 // At this point, we possibly removed blocks from the function, so we can't
3774 // renumber the domtree. At this point, we don't need it anymore, though.
3775 // TODO: move this to the point where the dominator tree is actually
3776 // invalidated (i.e., where blocks are removed without updating the domtree).
3777 MPDT = nullptr;
3778
3779 bool HasChanges = false;
3780 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3781 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3782 HasChanges = true;
3783 break;
3784 }
3785 }
3786 // Stop early if the new block order is identical to the existing one.
3787 if (!HasChanges)
3788 return;
3789
3790 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3791 for (auto &MBB : *F) {
3792 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3793 }
3794
3795 // Sort basic blocks in the function according to the computed order.
3796 DenseMap<const MachineBasicBlock *, size_t> NewIndex;
3797 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3798 NewIndex[MBB] = NewIndex.size();
3799 }
3800 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3801 return NewIndex[&L] < NewIndex[&R];
3802 });
3803
3804 // Update basic block branches by inserting explicit fallthrough branches
3805 // when required and re-optimize branches when possible.
3806 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3808 for (auto &MBB : *F) {
3809 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3811 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3812 // If this block had a fallthrough before we need an explicit unconditional
3813 // branch to that block if the fallthrough block is not adjacent to the
3814 // block in the new order.
3815 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3816 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3817 }
3818
3819 // It might be possible to optimize branches by flipping the condition.
3820 Cond.clear();
3821 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3822 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3823 continue;
3824 MBB.updateTerminator(FTMBB);
3825 }
3826}
3827
3828void MachineBlockPlacement::createCFGChainExtTsp() {
3829 BlockToChain.clear();
3830 ComputedEdges.clear();
3831 ChainAllocator.DestroyAll();
3832
3833 MachineBasicBlock *HeadBB = &F->front();
3834 BlockChain *FunctionChain =
3835 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3836
3837 for (MachineBasicBlock &MBB : *F) {
3838 if (HeadBB == &MBB)
3839 continue; // Ignore head of the chain
3840 FunctionChain->merge(&MBB, nullptr);
3841 }
3842}
3843
3844namespace {
3845
3846/// A pass to compute block placement statistics.
3847///
3848/// A separate pass to compute interesting statistics for evaluating block
3849/// placement. This is separate from the actual placement pass so that they can
3850/// be computed in the absence of any placement transformations or when using
3851/// alternative placement strategies.
3852class MachineBlockPlacementStats {
3853 /// A handle to the branch probability pass.
3854 const MachineBranchProbabilityInfo *MBPI;
3855
3856 /// A handle to the function-wide block frequency pass.
3857 const MachineBlockFrequencyInfo *MBFI;
3858
3859public:
3860 MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3861 const MachineBlockFrequencyInfo *MBFI)
3862 : MBPI(MBPI), MBFI(MBFI) {}
3863 bool run(MachineFunction &MF);
3864};
3865
3866class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3867public:
3868 static char ID; // Pass identification, replacement for typeid
3869
3870 MachineBlockPlacementStatsLegacy() : MachineFunctionPass(ID) {
3873 }
3874
3875 bool runOnMachineFunction(MachineFunction &F) override {
3876 auto *MBPI =
3877 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3878 auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3879 return MachineBlockPlacementStats(MBPI, MBFI).run(F);
3880 }
3881
3882 void getAnalysisUsage(AnalysisUsage &AU) const override {
3883 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
3884 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
3885 AU.setPreservesAll();
3887 }
3888};
3889
3890} // end anonymous namespace
3891
3892char MachineBlockPlacementStatsLegacy::ID = 0;
3893
3894char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3895
3896INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3897 "Basic Block Placement Stats", false, false)
3900INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3901 "Basic Block Placement Stats", false, false)
3902
3906 auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3907 auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
3908
3909 MachineBlockPlacementStats(&MBPI, &MBFI).run(MF);
3910 return PreservedAnalyses::all();
3911}
3912
3913bool MachineBlockPlacementStats::run(MachineFunction &F) {
3914 // Check for single-block functions and skip them.
3915 if (std::next(F.begin()) == F.end())
3916 return false;
3917
3918 if (!isFunctionInPrintList(F.getName()))
3919 return false;
3920
3921 for (MachineBasicBlock &MBB : F) {
3922 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3923 Statistic &NumBranches =
3924 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3925 Statistic &BranchTakenFreq =
3926 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3927 for (MachineBasicBlock *Succ : MBB.successors()) {
3928 // Skip if this successor is a fallthrough.
3929 if (MBB.isLayoutSuccessor(Succ))
3930 continue;
3931
3932 BlockFrequency EdgeFreq =
3933 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3934 ++NumBranches;
3935 BranchTakenFreq += EdgeFreq.getFrequency();
3936 }
3937 }
3938
3939 return false;
3940}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
static unsigned InstrCount
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > PredecessorLimit("block-placement-predecessor-limit", cl::desc("For blocks with more predecessors, certain layout optimizations" "will be disabled to prevent quadratic compile time."), cl::init(1000), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static cl::opt< bool > ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement."))
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunities in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
static const char * name
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition Function.h:334
Module * getParent()
Get the module that this global value is contained inside of...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:624
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void setAlignment(Align A)
Set alignment of the basic block.
LLVM_ABI bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
LLVM_ABI DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
LLVM_ABI bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
unsigned size() const
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Analysis pass that exposes the MachineLoopInfo for a machine function.
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
LLVM_ABI uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition SetVector.h:72
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
reference emplace_back(ArgTypes &&... Args)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition Allocator.h:440
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition Allocator.h:411
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
#define UINT64_MAX
Definition DataTypes.h:77
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
LLVM_ABI double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
LLVM_ABI std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &)
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
cl::opt< bool > ApplyExtTspWithoutProfile
constexpr from_range_t from_range
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
NoopStatistic Statistic
Definition Statistic.h:162
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
LLVM_ABI void initializeMachineBlockPlacementLegacyPass(PassRegistry &)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1915
cl::opt< unsigned > StaticLikelyProb
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2182
LLVM_ABI Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
LLVM_ABI char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
LLVM_ABI char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872