LLVM  10.0.0svn
LoopFuse.cpp
Go to the documentation of this file.
1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the loop fusion pass.
11 /// The implementation is largely based on the following document:
12 ///
13 /// Code Transformations to Augment the Scope of Loop Fusion in a
14 /// Production Compiler
15 /// Christopher Mark Barton
16 /// MSc Thesis
17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
18 ///
19 /// The general approach taken is to collect sets of control flow equivalent
20 /// loops and test whether they can be fused. The necessary conditions for
21 /// fusion are:
22 /// 1. The loops must be adjacent (there cannot be any statements between
23 /// the two loops).
24 /// 2. The loops must be conforming (they must execute the same number of
25 /// iterations).
26 /// 3. The loops must be control flow equivalent (if one loop executes, the
27 /// other is guaranteed to execute).
28 /// 4. There cannot be any negative distance dependencies between the loops.
29 /// If all of these conditions are satisfied, it is safe to fuse the loops.
30 ///
31 /// This implementation creates FusionCandidates that represent the loop and the
32 /// necessary information needed by fusion. It then operates on the fusion
33 /// candidates, first confirming that the candidate is eligible for fusion. The
34 /// candidates are then collected into control flow equivalent sets, sorted in
35 /// dominance order. Each set of control flow equivalent candidates is then
36 /// traversed, attempting to fuse pairs of candidates in the set. If all
37 /// requirements for fusion are met, the two candidates are fused, creating a
38 /// new (fused) candidate which is then added back into the set to consider for
39 /// additional fusion.
40 ///
41 /// This implementation currently does not make any modifications to remove
42 /// conditions for fusion. Code transformations to make loops conform to each of
43 /// the conditions for fusion are discussed in more detail in the document
44 /// above. These can be added to the current implementation in the future.
45 //===----------------------------------------------------------------------===//
46 
48 #include "llvm/ADT/Statistic.h"
51 #include "llvm/Analysis/LoopInfo.h"
56 #include "llvm/IR/Function.h"
57 #include "llvm/IR/Verifier.h"
58 #include "llvm/Pass.h"
59 #include "llvm/Support/Debug.h"
61 #include "llvm/Transforms/Scalar.h"
62 #include "llvm/Transforms/Utils.h"
64 
65 using namespace llvm;
66 
67 #define DEBUG_TYPE "loop-fusion"
68 
69 STATISTIC(FuseCounter, "Loops fused");
70 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
71 STATISTIC(InvalidPreheader, "Loop has invalid preheader");
72 STATISTIC(InvalidHeader, "Loop has invalid header");
73 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
74 STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
75 STATISTIC(InvalidLatch, "Loop has invalid latch");
76 STATISTIC(InvalidLoop, "Loop is invalid");
77 STATISTIC(AddressTakenBB, "Basic block has address taken");
78 STATISTIC(MayThrowException, "Loop may throw an exception");
79 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
80 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
81 STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
82 STATISTIC(UnknownTripCount, "Loop has unknown trip count");
83 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
84 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
85 STATISTIC(NonAdjacent, "Loops are not adjacent");
86 STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
87 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
88 STATISTIC(NonIdenticalGuards, "Candidates have different guards");
89 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");
90 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");
91 
96 };
97 
99  "loop-fusion-dependence-analysis",
100  cl::desc("Which dependence analysis should loop fusion use?"),
102  "Use the scalar evolution interface"),
104  "Use the dependence analysis interface"),
106  "Use all available analyses")),
108 
109 #ifndef NDEBUG
110 static cl::opt<bool>
111  VerboseFusionDebugging("loop-fusion-verbose-debug",
112  cl::desc("Enable verbose debugging for Loop Fusion"),
114 #endif
115 
116 namespace {
117 /// This class is used to represent a candidate for loop fusion. When it is
118 /// constructed, it checks the conditions for loop fusion to ensure that it
119 /// represents a valid candidate. It caches several parts of a loop that are
120 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
121 /// of continually querying the underlying Loop to retrieve these values. It is
122 /// assumed these will not change throughout loop fusion.
123 ///
124 /// The invalidate method should be used to indicate that the FusionCandidate is
125 /// no longer a valid candidate for fusion. Similarly, the isValid() method can
126 /// be used to ensure that the FusionCandidate is still valid for fusion.
127 struct FusionCandidate {
128  /// Cache of parts of the loop used throughout loop fusion. These should not
129  /// need to change throughout the analysis and transformation.
130  /// These parts are cached to avoid repeatedly looking up in the Loop class.
131 
132  /// Preheader of the loop this candidate represents
133  BasicBlock *Preheader;
134  /// Header of the loop this candidate represents
135  BasicBlock *Header;
136  /// Blocks in the loop that exit the loop
137  BasicBlock *ExitingBlock;
138  /// The successor block of this loop (where the exiting blocks go to)
139  BasicBlock *ExitBlock;
140  /// Latch of the loop
141  BasicBlock *Latch;
142  /// The loop that this fusion candidate represents
143  Loop *L;
144  /// Vector of instructions in this loop that read from memory
146  /// Vector of instructions in this loop that write to memory
148  /// Are all of the members of this fusion candidate still valid
149  bool Valid;
150  /// Guard branch of the loop, if it exists
151  BranchInst *GuardBranch;
152 
153  /// Dominator and PostDominator trees are needed for the
154  /// FusionCandidateCompare function, required by FusionCandidateSet to
155  /// determine where the FusionCandidate should be inserted into the set. These
156  /// are used to establish ordering of the FusionCandidates based on dominance.
157  const DominatorTree *DT;
158  const PostDominatorTree *PDT;
159 
161 
162  FusionCandidate(Loop *L, const DominatorTree *DT,
164  : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
165  ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
166  Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr),
167  DT(DT), PDT(PDT), ORE(ORE) {
168 
169  // TODO: This is temporary while we fuse both rotated and non-rotated
170  // loops. Once we switch to only fusing rotated loops, the initialization of
171  // GuardBranch can be moved into the initialization list above.
172  if (isRotated())
173  GuardBranch = L->getLoopGuardBranch();
174 
175  // Walk over all blocks in the loop and check for conditions that may
176  // prevent fusion. For each block, walk over all instructions and collect
177  // the memory reads and writes If any instructions that prevent fusion are
178  // found, invalidate this object and return.
179  for (BasicBlock *BB : L->blocks()) {
180  if (BB->hasAddressTaken()) {
181  invalidate();
182  reportInvalidCandidate(AddressTakenBB);
183  return;
184  }
185 
186  for (Instruction &I : *BB) {
187  if (I.mayThrow()) {
188  invalidate();
189  reportInvalidCandidate(MayThrowException);
190  return;
191  }
192  if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
193  if (SI->isVolatile()) {
194  invalidate();
195  reportInvalidCandidate(ContainsVolatileAccess);
196  return;
197  }
198  }
199  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
200  if (LI->isVolatile()) {
201  invalidate();
202  reportInvalidCandidate(ContainsVolatileAccess);
203  return;
204  }
205  }
206  if (I.mayWriteToMemory())
207  MemWrites.push_back(&I);
208  if (I.mayReadFromMemory())
209  MemReads.push_back(&I);
210  }
211  }
212  }
213 
214  /// Check if all members of the class are valid.
215  bool isValid() const {
216  return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
217  !L->isInvalid() && Valid;
218  }
219 
220  /// Verify that all members are in sync with the Loop object.
221  void verify() const {
222  assert(isValid() && "Candidate is not valid!!");
223  assert(!L->isInvalid() && "Loop is invalid!");
224  assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
225  assert(Header == L->getHeader() && "Header is out of sync");
226  assert(ExitingBlock == L->getExitingBlock() &&
227  "Exiting Blocks is out of sync");
228  assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
229  assert(Latch == L->getLoopLatch() && "Latch is out of sync");
230  }
231 
232  /// Get the entry block for this fusion candidate.
233  ///
234  /// If this fusion candidate represents a guarded loop, the entry block is the
235  /// loop guard block. If it represents an unguarded loop, the entry block is
236  /// the preheader of the loop.
237  BasicBlock *getEntryBlock() const {
238  if (GuardBranch)
239  return GuardBranch->getParent();
240  else
241  return Preheader;
242  }
243 
244  /// Given a guarded loop, get the successor of the guard that is not in the
245  /// loop.
246  ///
247  /// This method returns the successor of the loop guard that is not located
248  /// within the loop (i.e., the successor of the guard that is not the
249  /// preheader).
250  /// This method is only valid for guarded loops.
251  BasicBlock *getNonLoopBlock() const {
252  assert(GuardBranch && "Only valid on guarded loops.");
253  assert(GuardBranch->isConditional() &&
254  "Expecting guard to be a conditional branch.");
255  return (GuardBranch->getSuccessor(0) == Preheader)
256  ? GuardBranch->getSuccessor(1)
257  : GuardBranch->getSuccessor(0);
258  }
259 
260  bool isRotated() const {
261  assert(L && "Expecting loop to be valid.");
262  assert(Latch && "Expecting latch to be valid.");
263  return L->isLoopExiting(Latch);
264  }
265 
266 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
267  LLVM_DUMP_METHOD void dump() const {
268  dbgs() << "\tGuardBranch: "
269  << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
270  << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
271  << "\n"
272  << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
273  << "\tExitingBB: "
274  << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
275  << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
276  << "\n"
277  << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
278  << "\tEntryBlock: "
279  << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
280  << "\n";
281  }
282 #endif
283 
284  /// Determine if a fusion candidate (representing a loop) is eligible for
285  /// fusion. Note that this only checks whether a single loop can be fused - it
286  /// does not check whether it is *legal* to fuse two loops together.
287  bool isEligibleForFusion(ScalarEvolution &SE) const {
288  if (!isValid()) {
289  LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
290  if (!Preheader)
291  ++InvalidPreheader;
292  if (!Header)
293  ++InvalidHeader;
294  if (!ExitingBlock)
295  ++InvalidExitingBlock;
296  if (!ExitBlock)
297  ++InvalidExitBlock;
298  if (!Latch)
299  ++InvalidLatch;
300  if (L->isInvalid())
301  ++InvalidLoop;
302 
303  return false;
304  }
305 
306  // Require ScalarEvolution to be able to determine a trip count.
308  LLVM_DEBUG(dbgs() << "Loop " << L->getName()
309  << " trip count not computable!\n");
310  return reportInvalidCandidate(UnknownTripCount);
311  }
312 
313  if (!L->isLoopSimplifyForm()) {
314  LLVM_DEBUG(dbgs() << "Loop " << L->getName()
315  << " is not in simplified form!\n");
316  return reportInvalidCandidate(NotSimplifiedForm);
317  }
318 
319  return true;
320  }
321 
322 private:
323  // This is only used internally for now, to clear the MemWrites and MemReads
324  // list and setting Valid to false. I can't envision other uses of this right
325  // now, since once FusionCandidates are put into the FusionCandidateSet they
326  // are immutable. Thus, any time we need to change/update a FusionCandidate,
327  // we must create a new one and insert it into the FusionCandidateSet to
328  // ensure the FusionCandidateSet remains ordered correctly.
329  void invalidate() {
330  MemWrites.clear();
331  MemReads.clear();
332  Valid = false;
333  }
334 
335  bool reportInvalidCandidate(llvm::Statistic &Stat) const {
336  using namespace ore;
337  assert(L && Preheader && "Fusion candidate not initialized properly!");
338  ++Stat;
340  L->getStartLoc(), Preheader)
341  << "[" << Preheader->getParent()->getName() << "]: "
342  << "Loop is not a candidate for fusion: " << Stat.getDesc());
343  return false;
344  }
345 };
346 
347 struct FusionCandidateCompare {
348  /// Comparison functor to sort two Control Flow Equivalent fusion candidates
349  /// into dominance order.
350  /// If LHS dominates RHS and RHS post-dominates LHS, return true;
351  /// IF RHS dominates LHS and LHS post-dominates RHS, return false;
352  bool operator()(const FusionCandidate &LHS,
353  const FusionCandidate &RHS) const {
354  const DominatorTree *DT = LHS.DT;
355 
356  BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
357  BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
358 
359  // Do not save PDT to local variable as it is only used in asserts and thus
360  // will trigger an unused variable warning if building without asserts.
361  assert(DT && LHS.PDT && "Expecting valid dominator tree");
362 
363  // Do this compare first so if LHS == RHS, function returns false.
364  if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
365  // RHS dominates LHS
366  // Verify LHS post-dominates RHS
367  assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
368  return false;
369  }
370 
371  if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
372  // Verify RHS Postdominates LHS
373  assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
374  return true;
375  }
376 
377  // If LHS does not dominate RHS and RHS does not dominate LHS then there is
378  // no dominance relationship between the two FusionCandidates. Thus, they
379  // should not be in the same set together.
381  "No dominance relationship between these fusion candidates!");
382  }
383 };
384 
385 using LoopVector = SmallVector<Loop *, 4>;
386 
387 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
388 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
389 // dominates FC1 and FC1 post-dominates FC0.
390 // std::set was chosen because we want a sorted data structure with stable
391 // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent
392 // loops by moving intervening code around. When this intervening code contains
393 // loops, those loops will be moved also. The corresponding FusionCandidates
394 // will also need to be moved accordingly. As this is done, having stable
395 // iterators will simplify the logic. Similarly, having an efficient insert that
396 // keeps the FusionCandidateSet sorted will also simplify the implementation.
397 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
398 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
399 
400 #if !defined(NDEBUG)
402  const FusionCandidate &FC) {
403  if (FC.isValid())
404  OS << FC.Preheader->getName();
405  else
406  OS << "<Invalid>";
407 
408  return OS;
409 }
410 
412  const FusionCandidateSet &CandSet) {
413  for (const FusionCandidate &FC : CandSet)
414  OS << FC << '\n';
415 
416  return OS;
417 }
418 
419 static void
420 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
421  dbgs() << "Fusion Candidates: \n";
422  for (const auto &CandidateSet : FusionCandidates) {
423  dbgs() << "*** Fusion Candidate Set ***\n";
424  dbgs() << CandidateSet;
425  dbgs() << "****************************\n";
426  }
427 }
428 #endif
429 
430 /// Collect all loops in function at the same nest level, starting at the
431 /// outermost level.
432 ///
433 /// This data structure collects all loops at the same nest level for a
434 /// given function (specified by the LoopInfo object). It starts at the
435 /// outermost level.
436 struct LoopDepthTree {
437  using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
438  using iterator = LoopsOnLevelTy::iterator;
439  using const_iterator = LoopsOnLevelTy::const_iterator;
440 
441  LoopDepthTree(LoopInfo &LI) : Depth(1) {
442  if (!LI.empty())
443  LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
444  }
445 
446  /// Test whether a given loop has been removed from the function, and thus is
447  /// no longer valid.
448  bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
449 
450  /// Record that a given loop has been removed from the function and is no
451  /// longer valid.
452  void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
453 
454  /// Descend the tree to the next (inner) nesting level
455  void descend() {
456  LoopsOnLevelTy LoopsOnNextLevel;
457 
458  for (const LoopVector &LV : *this)
459  for (Loop *L : LV)
460  if (!isRemovedLoop(L) && L->begin() != L->end())
461  LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
462 
463  LoopsOnLevel = LoopsOnNextLevel;
464  RemovedLoops.clear();
465  Depth++;
466  }
467 
468  bool empty() const { return size() == 0; }
469  size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
470  unsigned getDepth() const { return Depth; }
471 
472  iterator begin() { return LoopsOnLevel.begin(); }
473  iterator end() { return LoopsOnLevel.end(); }
474  const_iterator begin() const { return LoopsOnLevel.begin(); }
475  const_iterator end() const { return LoopsOnLevel.end(); }
476 
477 private:
478  /// Set of loops that have been removed from the function and are no longer
479  /// valid.
480  SmallPtrSet<const Loop *, 8> RemovedLoops;
481 
482  /// Depth of the current level, starting at 1 (outermost loops).
483  unsigned Depth;
484 
485  /// Vector of loops at the current depth level that have the same parent loop
486  LoopsOnLevelTy LoopsOnLevel;
487 };
488 
489 #ifndef NDEBUG
490 static void printLoopVector(const LoopVector &LV) {
491  dbgs() << "****************************\n";
492  for (auto L : LV)
493  printLoop(*L, dbgs());
494  dbgs() << "****************************\n";
495 }
496 #endif
497 
498 struct LoopFuser {
499 private:
500  // Sets of control flow equivalent fusion candidates for a given nest level.
501  FusionCandidateCollection FusionCandidates;
502 
503  LoopDepthTree LDT;
504  DomTreeUpdater DTU;
505 
506  LoopInfo &LI;
507  DominatorTree &DT;
508  DependenceInfo &DI;
509  ScalarEvolution &SE;
510  PostDominatorTree &PDT;
512 
513 public:
514  LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
516  OptimizationRemarkEmitter &ORE, const DataLayout &DL)
517  : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
518  DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {}
519 
520  /// This is the main entry point for loop fusion. It will traverse the
521  /// specified function and collect candidate loops to fuse, starting at the
522  /// outermost nesting level and working inwards.
523  bool fuseLoops(Function &F) {
524 #ifndef NDEBUG
526  LI.print(dbgs());
527  }
528 #endif
529 
530  LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
531  << "\n");
532  bool Changed = false;
533 
534  while (!LDT.empty()) {
535  LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
536  << LDT.getDepth() << "\n";);
537 
538  for (const LoopVector &LV : LDT) {
539  assert(LV.size() > 0 && "Empty loop set was build!");
540 
541  // Skip singleton loop sets as they do not offer fusion opportunities on
542  // this level.
543  if (LV.size() == 1)
544  continue;
545 #ifndef NDEBUG
547  LLVM_DEBUG({
548  dbgs() << " Visit loop set (#" << LV.size() << "):\n";
549  printLoopVector(LV);
550  });
551  }
552 #endif
553 
554  collectFusionCandidates(LV);
555  Changed |= fuseCandidates();
556  }
557 
558  // Finished analyzing candidates at this level.
559  // Descend to the next level and clear all of the candidates currently
560  // collected. Note that it will not be possible to fuse any of the
561  // existing candidates with new candidates because the new candidates will
562  // be at a different nest level and thus not be control flow equivalent
563  // with all of the candidates collected so far.
564  LLVM_DEBUG(dbgs() << "Descend one level!\n");
565  LDT.descend();
566  FusionCandidates.clear();
567  }
568 
569  if (Changed)
570  LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
571 
572 #ifndef NDEBUG
573  assert(DT.verify());
574  assert(PDT.verify());
575  LI.verify(DT);
576  SE.verify();
577 #endif
578 
579  LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
580  return Changed;
581  }
582 
583 private:
584  /// Determine if two fusion candidates are control flow equivalent.
585  ///
586  /// Two fusion candidates are control flow equivalent if when one executes,
587  /// the other is guaranteed to execute. This is determined using dominators
588  /// and post-dominators: if A dominates B and B post-dominates A then A and B
589  /// are control-flow equivalent.
590  bool isControlFlowEquivalent(const FusionCandidate &FC0,
591  const FusionCandidate &FC1) const {
592  assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
593 
594  BasicBlock *FC0EntryBlock = FC0.getEntryBlock();
595  BasicBlock *FC1EntryBlock = FC1.getEntryBlock();
596 
597  if (DT.dominates(FC0EntryBlock, FC1EntryBlock))
598  return PDT.dominates(FC1EntryBlock, FC0EntryBlock);
599 
600  if (DT.dominates(FC1EntryBlock, FC0EntryBlock))
601  return PDT.dominates(FC0EntryBlock, FC1EntryBlock);
602 
603  return false;
604  }
605 
606  /// Iterate over all loops in the given loop set and identify the loops that
607  /// are eligible for fusion. Place all eligible fusion candidates into Control
608  /// Flow Equivalent sets, sorted by dominance.
609  void collectFusionCandidates(const LoopVector &LV) {
610  for (Loop *L : LV) {
611  FusionCandidate CurrCand(L, &DT, &PDT, ORE);
612  if (!CurrCand.isEligibleForFusion(SE))
613  continue;
614 
615  // Go through each list in FusionCandidates and determine if L is control
616  // flow equivalent with the first loop in that list. If it is, append LV.
617  // If not, go to the next list.
618  // If no suitable list is found, start another list and add it to
619  // FusionCandidates.
620  bool FoundSet = false;
621 
622  for (auto &CurrCandSet : FusionCandidates) {
623  if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
624  CurrCandSet.insert(CurrCand);
625  FoundSet = true;
626 #ifndef NDEBUG
628  LLVM_DEBUG(dbgs() << "Adding " << CurrCand
629  << " to existing candidate set\n");
630 #endif
631  break;
632  }
633  }
634  if (!FoundSet) {
635  // No set was found. Create a new set and add to FusionCandidates
636 #ifndef NDEBUG
638  LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
639 #endif
640  FusionCandidateSet NewCandSet;
641  NewCandSet.insert(CurrCand);
642  FusionCandidates.push_back(NewCandSet);
643  }
644  NumFusionCandidates++;
645  }
646  }
647 
648  /// Determine if it is beneficial to fuse two loops.
649  ///
650  /// For now, this method simply returns true because we want to fuse as much
651  /// as possible (primarily to test the pass). This method will evolve, over
652  /// time, to add heuristics for profitability of fusion.
653  bool isBeneficialFusion(const FusionCandidate &FC0,
654  const FusionCandidate &FC1) {
655  return true;
656  }
657 
658  /// Determine if two fusion candidates have the same trip count (i.e., they
659  /// execute the same number of iterations).
660  ///
661  /// Note that for now this method simply returns a boolean value because there
662  /// are no mechanisms in loop fusion to handle different trip counts. In the
663  /// future, this behaviour can be extended to adjust one of the loops to make
664  /// the trip counts equal (e.g., loop peeling). When this is added, this
665  /// interface may need to change to return more information than just a
666  /// boolean value.
667  bool identicalTripCounts(const FusionCandidate &FC0,
668  const FusionCandidate &FC1) const {
669  const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
670  if (isa<SCEVCouldNotCompute>(TripCount0)) {
671  UncomputableTripCount++;
672  LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
673  return false;
674  }
675 
676  const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
677  if (isa<SCEVCouldNotCompute>(TripCount1)) {
678  UncomputableTripCount++;
679  LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
680  return false;
681  }
682  LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
683  << *TripCount1 << " are "
684  << (TripCount0 == TripCount1 ? "identical" : "different")
685  << "\n");
686 
687  return (TripCount0 == TripCount1);
688  }
689 
690  /// Walk each set of control flow equivalent fusion candidates and attempt to
691  /// fuse them. This does a single linear traversal of all candidates in the
692  /// set. The conditions for legal fusion are checked at this point. If a pair
693  /// of fusion candidates passes all legality checks, they are fused together
694  /// and a new fusion candidate is created and added to the FusionCandidateSet.
695  /// The original fusion candidates are then removed, as they are no longer
696  /// valid.
697  bool fuseCandidates() {
698  bool Fused = false;
699  LLVM_DEBUG(printFusionCandidates(FusionCandidates));
700  for (auto &CandidateSet : FusionCandidates) {
701  if (CandidateSet.size() < 2)
702  continue;
703 
704  LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
705  << CandidateSet << "\n");
706 
707  for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
708  assert(!LDT.isRemovedLoop(FC0->L) &&
709  "Should not have removed loops in CandidateSet!");
710  auto FC1 = FC0;
711  for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
712  assert(!LDT.isRemovedLoop(FC1->L) &&
713  "Should not have removed loops in CandidateSet!");
714 
715  LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
716  dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
717 
718  FC0->verify();
719  FC1->verify();
720 
721  if (!identicalTripCounts(*FC0, *FC1)) {
722  LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
723  "counts. Not fusing.\n");
724  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
725  NonEqualTripCount);
726  continue;
727  }
728 
729  if (!isAdjacent(*FC0, *FC1)) {
730  LLVM_DEBUG(dbgs()
731  << "Fusion candidates are not adjacent. Not fusing.\n");
732  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
733  continue;
734  }
735 
736  // Ensure that FC0 and FC1 have identical guards.
737  // If one (or both) are not guarded, this check is not necessary.
738  if (FC0->GuardBranch && FC1->GuardBranch &&
739  !haveIdenticalGuards(*FC0, *FC1)) {
740  LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
741  "guards. Not Fusing.\n");
742  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
743  NonIdenticalGuards);
744  continue;
745  }
746 
747  // The following three checks look for empty blocks in FC0 and FC1. If
748  // any of these blocks are non-empty, we do not fuse. This is done
749  // because we currently do not have the safety checks to determine if
750  // it is safe to move the blocks past other blocks in the loop. Once
751  // these checks are added, these conditions can be relaxed.
752  if (!isEmptyPreheader(*FC1)) {
753  LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
754  "preheader. Not fusing.\n");
755  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
756  NonEmptyPreheader);
757  continue;
758  }
759 
760  if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {
761  LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "
762  "block. Not fusing.\n");
763  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
764  NonEmptyExitBlock);
765  continue;
766  }
767 
768  if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {
769  LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "
770  "block. Not fusing.\n");
771  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
772  NonEmptyGuardBlock);
773  continue;
774  }
775 
776  // Check the dependencies across the loops and do not fuse if it would
777  // violate them.
778  if (!dependencesAllowFusion(*FC0, *FC1)) {
779  LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
780  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
781  InvalidDependencies);
782  continue;
783  }
784 
785  bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
786  LLVM_DEBUG(dbgs()
787  << "\tFusion appears to be "
788  << (BeneficialToFuse ? "" : "un") << "profitable!\n");
789  if (!BeneficialToFuse) {
790  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
791  FusionNotBeneficial);
792  continue;
793  }
794  // All analysis has completed and has determined that fusion is legal
795  // and profitable. At this point, start transforming the code and
796  // perform fusion.
797 
798  LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
799  << *FC1 << "\n");
800 
801  // Report fusion to the Optimization Remarks.
802  // Note this needs to be done *before* performFusion because
803  // performFusion will change the original loops, making it not
804  // possible to identify them after fusion is complete.
805  reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
806 
807  FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
808  FusedCand.verify();
809  assert(FusedCand.isEligibleForFusion(SE) &&
810  "Fused candidate should be eligible for fusion!");
811 
812  // Notify the loop-depth-tree that these loops are not valid objects
813  LDT.removeLoop(FC1->L);
814 
815  CandidateSet.erase(FC0);
816  CandidateSet.erase(FC1);
817 
818  auto InsertPos = CandidateSet.insert(FusedCand);
819 
820  assert(InsertPos.second &&
821  "Unable to insert TargetCandidate in CandidateSet!");
822 
823  // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
824  // of the FC1 loop will attempt to fuse the new (fused) loop with the
825  // remaining candidates in the current candidate set.
826  FC0 = FC1 = InsertPos.first;
827 
828  LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
829  << "\n");
830 
831  Fused = true;
832  }
833  }
834  }
835  return Fused;
836  }
837 
838  /// Rewrite all additive recurrences in a SCEV to use a new loop.
839  class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
840  public:
841  AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
842  bool UseMax = true)
843  : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
844  NewL(NewL) {}
845 
846  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
847  const Loop *ExprL = Expr->getLoop();
849  if (ExprL == &OldL) {
850  Operands.append(Expr->op_begin(), Expr->op_end());
851  return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
852  }
853 
854  if (OldL.contains(ExprL)) {
855  bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
856  if (!UseMax || !Pos || !Expr->isAffine()) {
857  Valid = false;
858  return Expr;
859  }
860  return visit(Expr->getStart());
861  }
862 
863  for (const SCEV *Op : Expr->operands())
864  Operands.push_back(visit(Op));
865  return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
866  }
867 
868  bool wasValidSCEV() const { return Valid; }
869 
870  private:
871  bool Valid, UseMax;
872  const Loop &OldL, &NewL;
873  };
874 
875  /// Return false if the access functions of \p I0 and \p I1 could cause
876  /// a negative dependence.
877  bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
878  Instruction &I1, bool EqualIsInvalid) {
879  Value *Ptr0 = getLoadStorePointerOperand(&I0);
880  Value *Ptr1 = getLoadStorePointerOperand(&I1);
881  if (!Ptr0 || !Ptr1)
882  return false;
883 
884  const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
885  const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
886 #ifndef NDEBUG
888  LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
889  << *SCEVPtr1 << "\n");
890 #endif
891  AddRecLoopReplacer Rewriter(SE, L0, L1);
892  SCEVPtr0 = Rewriter.visit(SCEVPtr0);
893 #ifndef NDEBUG
895  LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
896  << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
897 #endif
898  if (!Rewriter.wasValidSCEV())
899  return false;
900 
901  // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
902  // L0) and the other is not. We could check if it is monotone and test
903  // the beginning and end value instead.
904 
905  BasicBlock *L0Header = L0.getHeader();
906  auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
907  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
908  if (!AddRec)
909  return false;
910  return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
911  !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
912  };
913  if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
914  return false;
915 
916  ICmpInst::Predicate Pred =
917  EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
918  bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
919 #ifndef NDEBUG
921  LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
922  << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
923  << "\n");
924 #endif
925  return IsAlwaysGE;
926  }
927 
928  /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
929  /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
930  /// specified by @p DepChoice are used to determine this.
931  bool dependencesAllowFusion(const FusionCandidate &FC0,
932  const FusionCandidate &FC1, Instruction &I0,
933  Instruction &I1, bool AnyDep,
934  FusionDependenceAnalysisChoice DepChoice) {
935 #ifndef NDEBUG
937  LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
938  << DepChoice << "\n");
939  }
940 #endif
941  switch (DepChoice) {
943  return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
945  auto DepResult = DI.depends(&I0, &I1, true);
946  if (!DepResult)
947  return true;
948 #ifndef NDEBUG
950  LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
951  dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
952  << (DepResult->isOrdered() ? "true" : "false")
953  << "]\n");
954  LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
955  << "\n");
956  }
957 #endif
958 
959  if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
960  LLVM_DEBUG(
961  dbgs() << "TODO: Implement pred/succ dependence handling!\n");
962 
963  // TODO: Can we actually use the dependence info analysis here?
964  return false;
965  }
966 
968  return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
970  dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
972  }
973 
974  llvm_unreachable("Unknown fusion dependence analysis choice!");
975  }
976 
977  /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
978  bool dependencesAllowFusion(const FusionCandidate &FC0,
979  const FusionCandidate &FC1) {
980  LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
981  << "\n");
982  assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
983  assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
984 
985  for (Instruction *WriteL0 : FC0.MemWrites) {
986  for (Instruction *WriteL1 : FC1.MemWrites)
987  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
988  /* AnyDep */ false,
990  InvalidDependencies++;
991  return false;
992  }
993  for (Instruction *ReadL1 : FC1.MemReads)
994  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
995  /* AnyDep */ false,
997  InvalidDependencies++;
998  return false;
999  }
1000  }
1001 
1002  for (Instruction *WriteL1 : FC1.MemWrites) {
1003  for (Instruction *WriteL0 : FC0.MemWrites)
1004  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
1005  /* AnyDep */ false,
1007  InvalidDependencies++;
1008  return false;
1009  }
1010  for (Instruction *ReadL0 : FC0.MemReads)
1011  if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
1012  /* AnyDep */ false,
1014  InvalidDependencies++;
1015  return false;
1016  }
1017  }
1018 
1019  // Walk through all uses in FC1. For each use, find the reaching def. If the
1020  // def is located in FC0 then it is is not safe to fuse.
1021  for (BasicBlock *BB : FC1.L->blocks())
1022  for (Instruction &I : *BB)
1023  for (auto &Op : I.operands())
1024  if (Instruction *Def = dyn_cast<Instruction>(Op))
1025  if (FC0.L->contains(Def->getParent())) {
1026  InvalidDependencies++;
1027  return false;
1028  }
1029 
1030  return true;
1031  }
1032 
1033  /// Determine if two fusion candidates are adjacent in the CFG.
1034  ///
1035  /// This method will determine if there are additional basic blocks in the CFG
1036  /// between the exit of \p FC0 and the entry of \p FC1.
1037  /// If the two candidates are guarded loops, then it checks whether the
1038  /// non-loop successor of the \p FC0 guard branch is the entry block of \p
1039  /// FC1. If not, then the loops are not adjacent. If the two candidates are
1040  /// not guarded loops, then it checks whether the exit block of \p FC0 is the
1041  /// preheader of \p FC1.
1042  bool isAdjacent(const FusionCandidate &FC0,
1043  const FusionCandidate &FC1) const {
1044  // If the successor of the guard branch is FC1, then the loops are adjacent
1045  if (FC0.GuardBranch)
1046  return FC0.getNonLoopBlock() == FC1.getEntryBlock();
1047  else
1048  return FC0.ExitBlock == FC1.getEntryBlock();
1049  }
1050 
1051  /// Determine if two fusion candidates have identical guards
1052  ///
1053  /// This method will determine if two fusion candidates have the same guards.
1054  /// The guards are considered the same if:
1055  /// 1. The instructions to compute the condition used in the compare are
1056  /// identical.
1057  /// 2. The successors of the guard have the same flow into/around the loop.
1058  /// If the compare instructions are identical, then the first successor of the
1059  /// guard must go to the same place (either the preheader of the loop or the
1060  /// NonLoopBlock). In other words, the the first successor of both loops must
1061  /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
1062  /// the NonLoopBlock). The same must be true for the second successor.
1063  bool haveIdenticalGuards(const FusionCandidate &FC0,
1064  const FusionCandidate &FC1) const {
1065  assert(FC0.GuardBranch && FC1.GuardBranch &&
1066  "Expecting FC0 and FC1 to be guarded loops.");
1067 
1068  if (auto FC0CmpInst =
1069  dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
1070  if (auto FC1CmpInst =
1071  dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
1072  if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
1073  return false;
1074 
1075  // The compare instructions are identical.
1076  // Now make sure the successor of the guards have the same flow into/around
1077  // the loop
1078  if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
1079  return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
1080  else
1081  return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
1082  }
1083 
1084  /// Check that the guard for \p FC *only* contains the cmp/branch for the
1085  /// guard.
1086  /// Once we are able to handle intervening code, any code in the guard block
1087  /// for FC1 will need to be treated as intervening code and checked whether
1088  /// it can safely move around the loops.
1089  bool isEmptyGuardBlock(const FusionCandidate &FC) const {
1090  assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
1091  if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
1092  auto *GuardBlock = FC.GuardBranch->getParent();
1093  // If the generation of the cmp value is in GuardBlock, then the size of
1094  // the guard block should be 2 (cmp + branch). If the generation of the
1095  // cmp value is in a different block, then the size of the guard block
1096  // should only be 1.
1097  if (CmpInst->getParent() == GuardBlock)
1098  return GuardBlock->size() == 2;
1099  else
1100  return GuardBlock->size() == 1;
1101  }
1102 
1103  return false;
1104  }
1105 
1106  bool isEmptyPreheader(const FusionCandidate &FC) const {
1107  assert(FC.Preheader && "Expecting a valid preheader");
1108  return FC.Preheader->size() == 1;
1109  }
1110 
1111  bool isEmptyExitBlock(const FusionCandidate &FC) const {
1112  assert(FC.ExitBlock && "Expecting a valid exit block");
1113  return FC.ExitBlock->size() == 1;
1114  }
1115 
1116  /// Fuse two fusion candidates, creating a new fused loop.
1117  ///
1118  /// This method contains the mechanics of fusing two loops, represented by \p
1119  /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
1120  /// postdominates \p FC0 (making them control flow equivalent). It also
1121  /// assumes that the other conditions for fusion have been met: adjacent,
1122  /// identical trip counts, and no negative distance dependencies exist that
1123  /// would prevent fusion. Thus, there is no checking for these conditions in
1124  /// this method.
1125  ///
1126  /// Fusion is performed by rewiring the CFG to update successor blocks of the
1127  /// components of tho loop. Specifically, the following changes are done:
1128  ///
1129  /// 1. The preheader of \p FC1 is removed as it is no longer necessary
1130  /// (because it is currently only a single statement block).
1131  /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
1132  /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.
1133  /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
1134  ///
1135  /// All of these modifications are done with dominator tree updates, thus
1136  /// keeping the dominator (and post dominator) information up-to-date.
1137  ///
1138  /// This can be improved in the future by actually merging blocks during
1139  /// fusion. For example, the preheader of \p FC1 can be merged with the
1140  /// preheader of \p FC0. This would allow loops with more than a single
1141  /// statement in the preheader to be fused. Similarly, the latch blocks of the
1142  /// two loops could also be fused into a single block. This will require
1143  /// analysis to prove it is safe to move the contents of the block past
1144  /// existing code, which currently has not been implemented.
1145  Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
1146  assert(FC0.isValid() && FC1.isValid() &&
1147  "Expecting valid fusion candidates");
1148 
1149  LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
1150  dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
1151 
1152  // Fusing guarded loops is handled slightly differently than non-guarded
1153  // loops and has been broken out into a separate method instead of trying to
1154  // intersperse the logic within a single method.
1155  if (FC0.GuardBranch)
1156  return fuseGuardedLoops(FC0, FC1);
1157 
1158  assert(FC1.Preheader == FC0.ExitBlock);
1159  assert(FC1.Preheader->size() == 1 &&
1160  FC1.Preheader->getSingleSuccessor() == FC1.Header);
1161 
1162  // Remember the phi nodes originally in the header of FC0 in order to rewire
1163  // them later. However, this is only necessary if the new loop carried
1164  // values might not dominate the exiting branch. While we do not generally
1165  // test if this is the case but simply insert intermediate phi nodes, we
1166  // need to make sure these intermediate phi nodes have different
1167  // predecessors. To this end, we filter the special case where the exiting
1168  // block is the latch block of the first loop. Nothing needs to be done
1169  // anyway as all loop carried values dominate the latch and thereby also the
1170  // exiting branch.
1171  SmallVector<PHINode *, 8> OriginalFC0PHIs;
1172  if (FC0.ExitingBlock != FC0.Latch)
1173  for (PHINode &PHI : FC0.Header->phis())
1174  OriginalFC0PHIs.push_back(&PHI);
1175 
1176  // Replace incoming blocks for header PHIs first.
1177  FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1178  FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1179 
1180  // Then modify the control flow and update DT and PDT.
1182 
1183  // The old exiting block of the first loop (FC0) has to jump to the header
1184  // of the second as we need to execute the code in the second header block
1185  // regardless of the trip count. That is, if the trip count is 0, so the
1186  // back edge is never taken, we still have to execute both loop headers,
1187  // especially (but not only!) if the second is a do-while style loop.
1188  // However, doing so might invalidate the phi nodes of the first loop as
1189  // the new values do only need to dominate their latch and not the exiting
1190  // predicate. To remedy this potential problem we always introduce phi
1191  // nodes in the header of the second loop later that select the loop carried
1192  // value, if the second header was reached through an old latch of the
1193  // first, or undef otherwise. This is sound as exiting the first implies the
1194  // second will exit too, __without__ taking the back-edge. [Their
1195  // trip-counts are equal after all.
1196  // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go
1197  // to FC1.Header? I think this is basically what the three sequences are
1198  // trying to accomplish; however, doing this directly in the CFG may mean
1199  // the DT/PDT becomes invalid
1200  FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
1201  FC1.Header);
1203  DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
1205  DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1206 
1207  // The pre-header of L1 is not necessary anymore.
1208  assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
1209  FC1.Preheader->getTerminator()->eraseFromParent();
1210  new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1212  DominatorTree::Delete, FC1.Preheader, FC1.Header));
1213 
1214  // Moves the phi nodes from the second to the first loops header block.
1215  while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1216  if (SE.isSCEVable(PHI->getType()))
1217  SE.forgetValue(PHI);
1218  if (PHI->hasNUsesOrMore(1))
1219  PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
1220  else
1221  PHI->eraseFromParent();
1222  }
1223 
1224  // Introduce new phi nodes in the second loop header to ensure
1225  // exiting the first and jumping to the header of the second does not break
1226  // the SSA property of the phis originally in the first loop. See also the
1227  // comment above.
1228  Instruction *L1HeaderIP = &FC1.Header->front();
1229  for (PHINode *LCPHI : OriginalFC0PHIs) {
1230  int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1231  assert(L1LatchBBIdx >= 0 &&
1232  "Expected loop carried value to be rewired at this point!");
1233 
1234  Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1235 
1236  PHINode *L1HeaderPHI = PHINode::Create(
1237  LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
1238  L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1239  L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
1240  FC0.ExitingBlock);
1241 
1242  LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1243  }
1244 
1245  // Replace latch terminator destinations.
1246  FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1247  FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1248 
1249  // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1250  // performed the updates above.
1251  if (FC0.Latch != FC0.ExitingBlock)
1253  DominatorTree::Insert, FC0.Latch, FC1.Header));
1254 
1256  FC0.Latch, FC0.Header));
1258  FC1.Latch, FC0.Header));
1260  FC1.Latch, FC1.Header));
1261 
1262  // Update DT/PDT
1263  DTU.applyUpdates(TreeUpdates);
1264 
1265  LI.removeBlock(FC1.Preheader);
1266  DTU.deleteBB(FC1.Preheader);
1267  DTU.flush();
1268 
1269  // Is there a way to keep SE up-to-date so we don't need to forget the loops
1270  // and rebuild the information in subsequent passes of fusion?
1271  SE.forgetLoop(FC1.L);
1272  SE.forgetLoop(FC0.L);
1273 
1274  // Merge the loops.
1275  SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
1276  FC1.L->block_end());
1277  for (BasicBlock *BB : Blocks) {
1278  FC0.L->addBlockEntry(BB);
1279  FC1.L->removeBlockFromLoop(BB);
1280  if (LI.getLoopFor(BB) != FC1.L)
1281  continue;
1282  LI.changeLoopFor(BB, FC0.L);
1283  }
1284  while (!FC1.L->empty()) {
1285  const auto &ChildLoopIt = FC1.L->begin();
1286  Loop *ChildLoop = *ChildLoopIt;
1287  FC1.L->removeChildLoop(ChildLoopIt);
1288  FC0.L->addChildLoop(ChildLoop);
1289  }
1290 
1291  // Delete the now empty loop L1.
1292  LI.erase(FC1.L);
1293 
1294 #ifndef NDEBUG
1295  assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1297  assert(PDT.verify());
1298  LI.verify(DT);
1299  SE.verify();
1300 #endif
1301 
1302  LLVM_DEBUG(dbgs() << "Fusion done:\n");
1303 
1304  return FC0.L;
1305  }
1306 
1307  /// Report details on loop fusion opportunities.
1308  ///
1309  /// This template function can be used to report both successful and missed
1310  /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1311  /// be one of:
1312  /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1313  /// given two valid fusion candidates.
1314  /// - OptimizationRemark to report successful fusion of two fusion
1315  /// candidates.
1316  /// The remarks will be printed using the form:
1317  /// <path/filename>:<line number>:<column number>: [<function name>]:
1318  /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1319  template <typename RemarkKind>
1320  void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1321  llvm::Statistic &Stat) {
1322  assert(FC0.Preheader && FC1.Preheader &&
1323  "Expecting valid fusion candidates");
1324  using namespace ore;
1325  ++Stat;
1326  ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1327  FC0.Preheader)
1328  << "[" << FC0.Preheader->getParent()->getName()
1329  << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1330  << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1331  << ": " << Stat.getDesc());
1332  }
1333 
1334  /// Fuse two guarded fusion candidates, creating a new fused loop.
1335  ///
1336  /// Fusing guarded loops is handled much the same way as fusing non-guarded
1337  /// loops. The rewiring of the CFG is slightly different though, because of
1338  /// the presence of the guards around the loops and the exit blocks after the
1339  /// loop body. As such, the new loop is rewired as follows:
1340  /// 1. Keep the guard branch from FC0 and use the non-loop block target
1341  /// from the FC1 guard branch.
1342  /// 2. Remove the exit block from FC0 (this exit block should be empty
1343  /// right now).
1344  /// 3. Remove the guard branch for FC1
1345  /// 4. Remove the preheader for FC1.
1346  /// The exit block successor for the latch of FC0 is updated to be the header
1347  /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
1348  /// be the header of FC0, thus creating the fused loop.
1349  Loop *fuseGuardedLoops(const FusionCandidate &FC0,
1350  const FusionCandidate &FC1) {
1351  assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
1352 
1353  BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
1354  BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
1355  BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
1356  BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
1357 
1358  assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
1359 
1361 
1362  ////////////////////////////////////////////////////////////////////////////
1363  // Update the Loop Guard
1364  ////////////////////////////////////////////////////////////////////////////
1365  // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
1366  // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
1367  // Thus, one path from the guard goes to the preheader for FC0 (and thus
1368  // executes the new fused loop) and the other path goes to the NonLoopBlock
1369  // for FC1 (where FC1 guard would have gone if FC1 was not executed).
1370  FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
1371  FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
1372  FC1.Header);
1373 
1374  // The guard of FC1 is not necessary anymore.
1375  FC1.GuardBranch->eraseFromParent();
1376  new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
1377 
1379  DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
1381  DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
1383  DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
1385  DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
1386 
1387  assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
1388  "Expecting guard block to have no predecessors");
1389  assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
1390  "Expecting guard block to have no successors");
1391 
1392  // Remember the phi nodes originally in the header of FC0 in order to rewire
1393  // them later. However, this is only necessary if the new loop carried
1394  // values might not dominate the exiting branch. While we do not generally
1395  // test if this is the case but simply insert intermediate phi nodes, we
1396  // need to make sure these intermediate phi nodes have different
1397  // predecessors. To this end, we filter the special case where the exiting
1398  // block is the latch block of the first loop. Nothing needs to be done
1399  // anyway as all loop carried values dominate the latch and thereby also the
1400  // exiting branch.
1401  // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
1402  // (because the loops are rotated. Thus, nothing will ever be added to
1403  // OriginalFC0PHIs.
1404  SmallVector<PHINode *, 8> OriginalFC0PHIs;
1405  if (FC0.ExitingBlock != FC0.Latch)
1406  for (PHINode &PHI : FC0.Header->phis())
1407  OriginalFC0PHIs.push_back(&PHI);
1408 
1409  assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
1410 
1411  // Replace incoming blocks for header PHIs first.
1412  FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1413  FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1414 
1415  // The old exiting block of the first loop (FC0) has to jump to the header
1416  // of the second as we need to execute the code in the second header block
1417  // regardless of the trip count. That is, if the trip count is 0, so the
1418  // back edge is never taken, we still have to execute both loop headers,
1419  // especially (but not only!) if the second is a do-while style loop.
1420  // However, doing so might invalidate the phi nodes of the first loop as
1421  // the new values do only need to dominate their latch and not the exiting
1422  // predicate. To remedy this potential problem we always introduce phi
1423  // nodes in the header of the second loop later that select the loop carried
1424  // value, if the second header was reached through an old latch of the
1425  // first, or undef otherwise. This is sound as exiting the first implies the
1426  // second will exit too, __without__ taking the back-edge (their
1427  // trip-counts are equal after all).
1428  FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
1429  FC1.Header);
1430 
1432  DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1434  DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1435 
1436  // Remove FC0 Exit Block
1437  // The exit block for FC0 is no longer needed since control will flow
1438  // directly to the header of FC1. Since it is an empty block, it can be
1439  // removed at this point.
1440  // TODO: In the future, we can handle non-empty exit blocks my merging any
1441  // instructions from FC0 exit block into FC1 exit block prior to removing
1442  // the block.
1443  assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
1444  "Expecting exit block to be empty");
1445  FC0.ExitBlock->getTerminator()->eraseFromParent();
1446  new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
1447 
1448  // Remove FC1 Preheader
1449  // The pre-header of L1 is not necessary anymore.
1450  assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
1451  FC1.Preheader->getTerminator()->eraseFromParent();
1452  new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1454  DominatorTree::Delete, FC1.Preheader, FC1.Header));
1455 
1456  // Moves the phi nodes from the second to the first loops header block.
1457  while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1458  if (SE.isSCEVable(PHI->getType()))
1459  SE.forgetValue(PHI);
1460  if (PHI->hasNUsesOrMore(1))
1461  PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
1462  else
1463  PHI->eraseFromParent();
1464  }
1465 
1466  // Introduce new phi nodes in the second loop header to ensure
1467  // exiting the first and jumping to the header of the second does not break
1468  // the SSA property of the phis originally in the first loop. See also the
1469  // comment above.
1470  Instruction *L1HeaderIP = &FC1.Header->front();
1471  for (PHINode *LCPHI : OriginalFC0PHIs) {
1472  int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1473  assert(L1LatchBBIdx >= 0 &&
1474  "Expected loop carried value to be rewired at this point!");
1475 
1476  Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1477 
1478  PHINode *L1HeaderPHI = PHINode::Create(
1479  LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
1480  L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1481  L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
1482  FC0.ExitingBlock);
1483 
1484  LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1485  }
1486 
1487  // Update the latches
1488 
1489  // Replace latch terminator destinations.
1490  FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1491  FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1492 
1493  // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1494  // performed the updates above.
1495  if (FC0.Latch != FC0.ExitingBlock)
1497  DominatorTree::Insert, FC0.Latch, FC1.Header));
1498 
1500  FC0.Latch, FC0.Header));
1502  FC1.Latch, FC0.Header));
1504  FC1.Latch, FC1.Header));
1505 
1506  // All done
1507  // Apply the updates to the Dominator Tree and cleanup.
1508 
1509  assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
1510  "FC1GuardBlock has successors!!");
1511  assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
1512  "FC1GuardBlock has predecessors!!");
1513 
1514  // Update DT/PDT
1515  DTU.applyUpdates(TreeUpdates);
1516 
1517  LI.removeBlock(FC1.Preheader);
1518  DTU.deleteBB(FC1.Preheader);
1519  DTU.deleteBB(FC0.ExitBlock);
1520  DTU.flush();
1521 
1522  // Is there a way to keep SE up-to-date so we don't need to forget the loops
1523  // and rebuild the information in subsequent passes of fusion?
1524  SE.forgetLoop(FC1.L);
1525  SE.forgetLoop(FC0.L);
1526 
1527  // Merge the loops.
1528  SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
1529  FC1.L->block_end());
1530  for (BasicBlock *BB : Blocks) {
1531  FC0.L->addBlockEntry(BB);
1532  FC1.L->removeBlockFromLoop(BB);
1533  if (LI.getLoopFor(BB) != FC1.L)
1534  continue;
1535  LI.changeLoopFor(BB, FC0.L);
1536  }
1537  while (!FC1.L->empty()) {
1538  const auto &ChildLoopIt = FC1.L->begin();
1539  Loop *ChildLoop = *ChildLoopIt;
1540  FC1.L->removeChildLoop(ChildLoopIt);
1541  FC0.L->addChildLoop(ChildLoop);
1542  }
1543 
1544  // Delete the now empty loop L1.
1545  LI.erase(FC1.L);
1546 
1547 #ifndef NDEBUG
1548  assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1550  assert(PDT.verify());
1551  LI.verify(DT);
1552  SE.verify();
1553 #endif
1554 
1555  LLVM_DEBUG(dbgs() << "Fusion done:\n");
1556 
1557  return FC0.L;
1558  }
1559 };
1560 
1561 struct LoopFuseLegacy : public FunctionPass {
1562 
1563  static char ID;
1564 
1565  LoopFuseLegacy() : FunctionPass(ID) {
1567  }
1568 
1569  void getAnalysisUsage(AnalysisUsage &AU) const override {
1577 
1582  }
1583 
1584  bool runOnFunction(Function &F) override {
1585  if (skipFunction(F))
1586  return false;
1587  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1588  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1589  auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1590  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1591  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
1592  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1593 
1594  const DataLayout &DL = F.getParent()->getDataLayout();
1595  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
1596  return LF.fuseLoops(F);
1597  }
1598 };
1599 } // namespace
1600 
1602  auto &LI = AM.getResult<LoopAnalysis>(F);
1603  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1604  auto &DI = AM.getResult<DependenceAnalysis>(F);
1605  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1606  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
1608 
1609  const DataLayout &DL = F.getParent()->getDataLayout();
1610  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
1611  bool Changed = LF.fuseLoops(F);
1612  if (!Changed)
1613  return PreservedAnalyses::all();
1614 
1615  PreservedAnalyses PA;
1619  PA.preserve<LoopAnalysis>();
1620  return PA;
1621 }
1622 
1623 char LoopFuseLegacy::ID = 0;
1624 
1625 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
1626  false)
1633 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
1634 
1635 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
void initializeLoopFuseLegacyPass(PassRegistry &)
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:722
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
raw_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:209
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
bool empty() const
Definition: LoopInfo.h:907
DiagnosticInfoOptimizationBase::Argument NV
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:777
AnalysisPass to compute dependence information in a function.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:484
Legacy pass manager pass to access dependence information.
The main scalar evolution driver.
This file implements the Loop Fusion pass.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:160
BasicBlock * getSuccessor(unsigned i) const
STATISTIC(NumFunctions, "Total number of functions")
void flush()
Apply all pending updates to available trees and flush all BasicBlocks awaiting deletion.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:230
F(f)
block Block Frequency true
An instruction for reading from memory.
Definition: Instructions.h:169
INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) FunctionPass *llvm
Definition: LoopFuse.cpp:1625
DependenceInfo - This class is the main dependence-analysis driver.
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:32
bool isInvalid() const
Return true if this loop is no longer valid.
Definition: LoopInfo.h:198
void verify(const DominatorTreeBase< BlockT, false > &DomTree) const
Definition: LoopInfoImpl.h:681
static cl::opt< FusionDependenceAnalysisChoice > FusionDependenceAnalysis("loop-fusion-dependence-analysis", cl::desc("Which dependence analysis should loop fusion use?"), cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", "Use the scalar evolution interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", "Use the dependence analysis interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", "Use all available analyses")), cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore)
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4429
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:369
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:928
Diagnostic information for optimization analysis remarks.
static cl::opt< bool > VerboseFusionDebugging("loop-fusion-verbose-debug", cl::desc("Enable verbose debugging for Loop Fusion"), cl::Hidden, cl::init(false), cl::ZeroOrMore)
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:857
friend const_iterator begin(StringRef path, Style style)
Get begin iterator over path.
Definition: Path.cpp:224
const char * getName() const
Definition: Statistic.h:57
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:1183
const char * getDesc() const
Definition: Statistic.h:58
mir Rename Register Operands
BranchInst * getLoopGuardBranch() const
Return the loop guard branch, if it exists.
Definition: LoopInfo.cpp:362
BlockT * getHeader() const
Definition: LoopInfo.h:105
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:102
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS...
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:246
This node represents a polynomial recurrence on the trip count of the specified loop.
An instruction for storing to memory.
Definition: Instructions.h:325
op_iterator op_begin() const
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:105
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop...
Definition: LoopInfo.h:208
void deleteBB(BasicBlock *DelBB)
Delete DelBB.
Conditional or Unconditional Branch instruction.
This function has undefined behavior.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:652
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:112
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopFuse.cpp:1601
const SCEV * getSCEVAtScope(const SCEV *S, const Loop *L)
Return a SCEV expression for the specified value at the specified scope in the program.
Represent the analysis usage information of a pass.
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:42
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:732
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:115
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:75
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1446
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:611
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool verify(const TargetRegisterInfo &TRI) const
Check that information hold by this instance make sense for the given TRI.
signed greater than
Definition: InstrTypes.h:759
char & LoopSimplifyID
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:197
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void print(raw_ostream &OS) const
Definition: LoopInfoImpl.h:610
auto size(R &&Range, typename std::enable_if< std::is_same< typename std::iterator_traits< decltype(Range.begin())>::iterator_category, std::random_access_iterator_tag >::value, void >::type *=nullptr) -> decltype(std::distance(Range.begin(), Range.end()))
Get the size of a range.
Definition: STLExtras.h:1146
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
void applyUpdates(ArrayRef< DominatorTree::UpdateType > Updates)
Submit updates to all available trees.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:314
reverse_iterator rend() const
Definition: LoopInfo.h:906
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
Analysis pass which computes a PostDominatorTree.
static unsigned getDepth(const SmallVectorImpl< const MachineBasicBlock *> &Stack, const MachineBasicBlock *MBB)
iterator begin() const
Definition: LoopInfo.h:147
bool isConditional() const
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
#define DEBUG_TYPE
Definition: LoopFuse.cpp:67
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:387
reverse_iterator rbegin() const
Definition: LoopInfo.h:905
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:627
FunctionPass * createLoopFusePass()
Analysis pass that exposes the ScalarEvolution for a function.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:220
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:464
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
This class represents an analyzed expression in the program.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
StringRef getName() const
Definition: LoopInfo.h:827
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:5103
iterator end() const
Definition: LoopInfo.h:148
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
Definition: LoopInfo.h:960
void preserve()
Mark an analysis as preserved.
Definition: PassManager.h:175
FusionDependenceAnalysisChoice
Definition: LoopFuse.cpp:92
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:2047
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
LLVM Value Representation.
Definition: Value.h:74
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
OptimizationRemarkEmitter legacy analysis pass.
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
The legacy pass manager&#39;s analysis pass to compute loop information.
Definition: LoopInfo.h:1208
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
A container for analyses that lazily runs them and caches their results.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:259
#define LLVM_DEBUG(X)
Definition: Debug.h:122
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:161
void printLoop(Loop &L, raw_ostream &OS, const std::string &Banner="")
Function to print a loop&#39;s contents as LLVM&#39;s text IR assembly.
Definition: LoopInfo.cpp:936
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
The optimization diagnostic interface.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
Definition: LoopInfo.h:987
This visitor recursively visits a SCEV expression and re-writes it.
signed greater or equal
Definition: InstrTypes.h:760
const BasicBlock * getParent() const
Definition: Instruction.h:66