LLVM  10.0.0svn
LoopFuse.cpp
Go to the documentation of this file.
1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the loop fusion pass.
11 /// The implementation is largely based on the following document:
12 ///
13 /// Code Transformations to Augment the Scope of Loop Fusion in a
14 /// Production Compiler
15 /// Christopher Mark Barton
16 /// MSc Thesis
17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
18 ///
19 /// The general approach taken is to collect sets of control flow equivalent
20 /// loops and test whether they can be fused. The necessary conditions for
21 /// fusion are:
22 /// 1. The loops must be adjacent (there cannot be any statements between
23 /// the two loops).
24 /// 2. The loops must be conforming (they must execute the same number of
25 /// iterations).
26 /// 3. The loops must be control flow equivalent (if one loop executes, the
27 /// other is guaranteed to execute).
28 /// 4. There cannot be any negative distance dependencies between the loops.
29 /// If all of these conditions are satisfied, it is safe to fuse the loops.
30 ///
31 /// This implementation creates FusionCandidates that represent the loop and the
32 /// necessary information needed by fusion. It then operates on the fusion
33 /// candidates, first confirming that the candidate is eligible for fusion. The
34 /// candidates are then collected into control flow equivalent sets, sorted in
35 /// dominance order. Each set of control flow equivalent candidates is then
36 /// traversed, attempting to fuse pairs of candidates in the set. If all
37 /// requirements for fusion are met, the two candidates are fused, creating a
38 /// new (fused) candidate which is then added back into the set to consider for
39 /// additional fusion.
40 ///
41 /// This implementation currently does not make any modifications to remove
42 /// conditions for fusion. Code transformations to make loops conform to each of
43 /// the conditions for fusion are discussed in more detail in the document
44 /// above. These can be added to the current implementation in the future.
45 //===----------------------------------------------------------------------===//
46 
48 #include "llvm/ADT/Statistic.h"
51 #include "llvm/Analysis/LoopInfo.h"
56 #include "llvm/IR/Function.h"
57 #include "llvm/IR/Verifier.h"
58 #include "llvm/Pass.h"
59 #include "llvm/Support/Debug.h"
61 #include "llvm/Transforms/Scalar.h"
62 #include "llvm/Transforms/Utils.h"
64 
65 using namespace llvm;
66 
67 #define DEBUG_TYPE "loop-fusion"
68 
69 STATISTIC(FuseCounter, "Loops fused");
70 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
71 STATISTIC(InvalidPreheader, "Loop has invalid preheader");
72 STATISTIC(InvalidHeader, "Loop has invalid header");
73 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
74 STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
75 STATISTIC(InvalidLatch, "Loop has invalid latch");
76 STATISTIC(InvalidLoop, "Loop is invalid");
77 STATISTIC(AddressTakenBB, "Basic block has address taken");
78 STATISTIC(MayThrowException, "Loop may throw an exception");
79 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
80 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
81 STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
82 STATISTIC(UnknownTripCount, "Loop has unknown trip count");
83 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
84 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
85 STATISTIC(NonAdjacent, "Loops are not adjacent");
86 STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
87 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
88 
93 };
94 
96  "loop-fusion-dependence-analysis",
97  cl::desc("Which dependence analysis should loop fusion use?"),
99  "Use the scalar evolution interface"),
101  "Use the dependence analysis interface"),
103  "Use all available analyses")),
105 
106 #ifndef NDEBUG
107 static cl::opt<bool>
108  VerboseFusionDebugging("loop-fusion-verbose-debug",
109  cl::desc("Enable verbose debugging for Loop Fusion"),
111 #endif
112 
113 namespace {
114 /// This class is used to represent a candidate for loop fusion. When it is
115 /// constructed, it checks the conditions for loop fusion to ensure that it
116 /// represents a valid candidate. It caches several parts of a loop that are
117 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
118 /// of continually querying the underlying Loop to retrieve these values. It is
119 /// assumed these will not change throughout loop fusion.
120 ///
121 /// The invalidate method should be used to indicate that the FusionCandidate is
122 /// no longer a valid candidate for fusion. Similarly, the isValid() method can
123 /// be used to ensure that the FusionCandidate is still valid for fusion.
124 struct FusionCandidate {
125  /// Cache of parts of the loop used throughout loop fusion. These should not
126  /// need to change throughout the analysis and transformation.
127  /// These parts are cached to avoid repeatedly looking up in the Loop class.
128 
129  /// Preheader of the loop this candidate represents
130  BasicBlock *Preheader;
131  /// Header of the loop this candidate represents
132  BasicBlock *Header;
133  /// Blocks in the loop that exit the loop
134  BasicBlock *ExitingBlock;
135  /// The successor block of this loop (where the exiting blocks go to)
136  BasicBlock *ExitBlock;
137  /// Latch of the loop
138  BasicBlock *Latch;
139  /// The loop that this fusion candidate represents
140  Loop *L;
141  /// Vector of instructions in this loop that read from memory
143  /// Vector of instructions in this loop that write to memory
145  /// Are all of the members of this fusion candidate still valid
146  bool Valid;
147 
148  /// Dominator and PostDominator trees are needed for the
149  /// FusionCandidateCompare function, required by FusionCandidateSet to
150  /// determine where the FusionCandidate should be inserted into the set. These
151  /// are used to establish ordering of the FusionCandidates based on dominance.
152  const DominatorTree *DT;
153  const PostDominatorTree *PDT;
154 
156 
157  FusionCandidate(Loop *L, const DominatorTree *DT,
159  : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
160  ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
161  Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT),
162  ORE(ORE) {
163 
164  // Walk over all blocks in the loop and check for conditions that may
165  // prevent fusion. For each block, walk over all instructions and collect
166  // the memory reads and writes If any instructions that prevent fusion are
167  // found, invalidate this object and return.
168  for (BasicBlock *BB : L->blocks()) {
169  if (BB->hasAddressTaken()) {
170  invalidate();
171  reportInvalidCandidate(AddressTakenBB);
172  return;
173  }
174 
175  for (Instruction &I : *BB) {
176  if (I.mayThrow()) {
177  invalidate();
178  reportInvalidCandidate(MayThrowException);
179  return;
180  }
181  if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
182  if (SI->isVolatile()) {
183  invalidate();
184  reportInvalidCandidate(ContainsVolatileAccess);
185  return;
186  }
187  }
188  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
189  if (LI->isVolatile()) {
190  invalidate();
191  reportInvalidCandidate(ContainsVolatileAccess);
192  return;
193  }
194  }
195  if (I.mayWriteToMemory())
196  MemWrites.push_back(&I);
197  if (I.mayReadFromMemory())
198  MemReads.push_back(&I);
199  }
200  }
201  }
202 
203  /// Check if all members of the class are valid.
204  bool isValid() const {
205  return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
206  !L->isInvalid() && Valid;
207  }
208 
209  /// Verify that all members are in sync with the Loop object.
210  void verify() const {
211  assert(isValid() && "Candidate is not valid!!");
212  assert(!L->isInvalid() && "Loop is invalid!");
213  assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
214  assert(Header == L->getHeader() && "Header is out of sync");
215  assert(ExitingBlock == L->getExitingBlock() &&
216  "Exiting Blocks is out of sync");
217  assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
218  assert(Latch == L->getLoopLatch() && "Latch is out of sync");
219  }
220 
221 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
222  LLVM_DUMP_METHOD void dump() const {
223  dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
224  << "\n"
225  << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
226  << "\tExitingBB: "
227  << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
228  << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
229  << "\n"
230  << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n";
231  }
232 #endif
233 
234  /// Determine if a fusion candidate (representing a loop) is eligible for
235  /// fusion. Note that this only checks whether a single loop can be fused - it
236  /// does not check whether it is *legal* to fuse two loops together.
237  bool isEligibleForFusion(ScalarEvolution &SE) const {
238  if (!isValid()) {
239  LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
240  if (!Preheader)
241  ++InvalidPreheader;
242  if (!Header)
243  ++InvalidHeader;
244  if (!ExitingBlock)
245  ++InvalidExitingBlock;
246  if (!ExitBlock)
247  ++InvalidExitBlock;
248  if (!Latch)
249  ++InvalidLatch;
250  if (L->isInvalid())
251  ++InvalidLoop;
252 
253  return false;
254  }
255 
256  // Require ScalarEvolution to be able to determine a trip count.
258  LLVM_DEBUG(dbgs() << "Loop " << L->getName()
259  << " trip count not computable!\n");
260  return reportInvalidCandidate(UnknownTripCount);
261  }
262 
263  if (!L->isLoopSimplifyForm()) {
264  LLVM_DEBUG(dbgs() << "Loop " << L->getName()
265  << " is not in simplified form!\n");
266  return reportInvalidCandidate(NotSimplifiedForm);
267  }
268 
269  return true;
270  }
271 
272 private:
273  // This is only used internally for now, to clear the MemWrites and MemReads
274  // list and setting Valid to false. I can't envision other uses of this right
275  // now, since once FusionCandidates are put into the FusionCandidateSet they
276  // are immutable. Thus, any time we need to change/update a FusionCandidate,
277  // we must create a new one and insert it into the FusionCandidateSet to
278  // ensure the FusionCandidateSet remains ordered correctly.
279  void invalidate() {
280  MemWrites.clear();
281  MemReads.clear();
282  Valid = false;
283  }
284 
285  bool reportInvalidCandidate(llvm::Statistic &Stat) const {
286  using namespace ore;
287  assert(L && Preheader && "Fusion candidate not initialized properly!");
288  ++Stat;
290  L->getStartLoc(), Preheader)
291  << "[" << Preheader->getParent()->getName() << "]: "
292  << "Loop is not a candidate for fusion: " << Stat.getDesc());
293  return false;
294  }
295 };
296 
297 struct FusionCandidateCompare {
298  /// Comparison functor to sort two Control Flow Equivalent fusion candidates
299  /// into dominance order.
300  /// If LHS dominates RHS and RHS post-dominates LHS, return true;
301  /// IF RHS dominates LHS and LHS post-dominates RHS, return false;
302  bool operator()(const FusionCandidate &LHS,
303  const FusionCandidate &RHS) const {
304  const DominatorTree *DT = LHS.DT;
305 
306  // Do not save PDT to local variable as it is only used in asserts and thus
307  // will trigger an unused variable warning if building without asserts.
308  assert(DT && LHS.PDT && "Expecting valid dominator tree");
309 
310  // Do this compare first so if LHS == RHS, function returns false.
311  if (DT->dominates(RHS.Preheader, LHS.Preheader)) {
312  // RHS dominates LHS
313  // Verify LHS post-dominates RHS
314  assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader));
315  return false;
316  }
317 
318  if (DT->dominates(LHS.Preheader, RHS.Preheader)) {
319  // Verify RHS Postdominates LHS
320  assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader));
321  return true;
322  }
323 
324  // If LHS does not dominate RHS and RHS does not dominate LHS then there is
325  // no dominance relationship between the two FusionCandidates. Thus, they
326  // should not be in the same set together.
328  "No dominance relationship between these fusion candidates!");
329  }
330 };
331 
332 using LoopVector = SmallVector<Loop *, 4>;
333 
334 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
335 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
336 // dominates FC1 and FC1 post-dominates FC0.
337 // std::set was chosen because we want a sorted data structure with stable
338 // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent
339 // loops by moving intervening code around. When this intervening code contains
340 // loops, those loops will be moved also. The corresponding FusionCandidates
341 // will also need to be moved accordingly. As this is done, having stable
342 // iterators will simplify the logic. Similarly, having an efficient insert that
343 // keeps the FusionCandidateSet sorted will also simplify the implementation.
344 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
345 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
346 
347 #if !defined(NDEBUG)
349  const FusionCandidate &FC) {
350  if (FC.isValid())
351  OS << FC.Preheader->getName();
352  else
353  OS << "<Invalid>";
354 
355  return OS;
356 }
357 
359  const FusionCandidateSet &CandSet) {
360  for (const FusionCandidate &FC : CandSet)
361  OS << FC << '\n';
362 
363  return OS;
364 }
365 
366 static void
367 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
368  dbgs() << "Fusion Candidates: \n";
369  for (const auto &CandidateSet : FusionCandidates) {
370  dbgs() << "*** Fusion Candidate Set ***\n";
371  dbgs() << CandidateSet;
372  dbgs() << "****************************\n";
373  }
374 }
375 #endif
376 
377 /// Collect all loops in function at the same nest level, starting at the
378 /// outermost level.
379 ///
380 /// This data structure collects all loops at the same nest level for a
381 /// given function (specified by the LoopInfo object). It starts at the
382 /// outermost level.
383 struct LoopDepthTree {
384  using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
385  using iterator = LoopsOnLevelTy::iterator;
386  using const_iterator = LoopsOnLevelTy::const_iterator;
387 
388  LoopDepthTree(LoopInfo &LI) : Depth(1) {
389  if (!LI.empty())
390  LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
391  }
392 
393  /// Test whether a given loop has been removed from the function, and thus is
394  /// no longer valid.
395  bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
396 
397  /// Record that a given loop has been removed from the function and is no
398  /// longer valid.
399  void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
400 
401  /// Descend the tree to the next (inner) nesting level
402  void descend() {
403  LoopsOnLevelTy LoopsOnNextLevel;
404 
405  for (const LoopVector &LV : *this)
406  for (Loop *L : LV)
407  if (!isRemovedLoop(L) && L->begin() != L->end())
408  LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
409 
410  LoopsOnLevel = LoopsOnNextLevel;
411  RemovedLoops.clear();
412  Depth++;
413  }
414 
415  bool empty() const { return size() == 0; }
416  size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
417  unsigned getDepth() const { return Depth; }
418 
419  iterator begin() { return LoopsOnLevel.begin(); }
420  iterator end() { return LoopsOnLevel.end(); }
421  const_iterator begin() const { return LoopsOnLevel.begin(); }
422  const_iterator end() const { return LoopsOnLevel.end(); }
423 
424 private:
425  /// Set of loops that have been removed from the function and are no longer
426  /// valid.
427  SmallPtrSet<const Loop *, 8> RemovedLoops;
428 
429  /// Depth of the current level, starting at 1 (outermost loops).
430  unsigned Depth;
431 
432  /// Vector of loops at the current depth level that have the same parent loop
433  LoopsOnLevelTy LoopsOnLevel;
434 };
435 
436 #ifndef NDEBUG
437 static void printLoopVector(const LoopVector &LV) {
438  dbgs() << "****************************\n";
439  for (auto L : LV)
440  printLoop(*L, dbgs());
441  dbgs() << "****************************\n";
442 }
443 #endif
444 
445 struct LoopFuser {
446 private:
447  // Sets of control flow equivalent fusion candidates for a given nest level.
448  FusionCandidateCollection FusionCandidates;
449 
450  LoopDepthTree LDT;
451  DomTreeUpdater DTU;
452 
453  LoopInfo &LI;
454  DominatorTree &DT;
455  DependenceInfo &DI;
456  ScalarEvolution &SE;
457  PostDominatorTree &PDT;
459 
460 public:
461  LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
463  OptimizationRemarkEmitter &ORE, const DataLayout &DL)
464  : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
465  DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {}
466 
467  /// This is the main entry point for loop fusion. It will traverse the
468  /// specified function and collect candidate loops to fuse, starting at the
469  /// outermost nesting level and working inwards.
470  bool fuseLoops(Function &F) {
471 #ifndef NDEBUG
473  LI.print(dbgs());
474  }
475 #endif
476 
477  LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
478  << "\n");
479  bool Changed = false;
480 
481  while (!LDT.empty()) {
482  LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
483  << LDT.getDepth() << "\n";);
484 
485  for (const LoopVector &LV : LDT) {
486  assert(LV.size() > 0 && "Empty loop set was build!");
487 
488  // Skip singleton loop sets as they do not offer fusion opportunities on
489  // this level.
490  if (LV.size() == 1)
491  continue;
492 #ifndef NDEBUG
494  LLVM_DEBUG({
495  dbgs() << " Visit loop set (#" << LV.size() << "):\n";
496  printLoopVector(LV);
497  });
498  }
499 #endif
500 
501  collectFusionCandidates(LV);
502  Changed |= fuseCandidates();
503  }
504 
505  // Finished analyzing candidates at this level.
506  // Descend to the next level and clear all of the candidates currently
507  // collected. Note that it will not be possible to fuse any of the
508  // existing candidates with new candidates because the new candidates will
509  // be at a different nest level and thus not be control flow equivalent
510  // with all of the candidates collected so far.
511  LLVM_DEBUG(dbgs() << "Descend one level!\n");
512  LDT.descend();
513  FusionCandidates.clear();
514  }
515 
516  if (Changed)
517  LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
518 
519 #ifndef NDEBUG
520  assert(DT.verify());
521  assert(PDT.verify());
522  LI.verify(DT);
523  SE.verify();
524 #endif
525 
526  LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
527  return Changed;
528  }
529 
530 private:
531  /// Determine if two fusion candidates are control flow equivalent.
532  ///
533  /// Two fusion candidates are control flow equivalent if when one executes,
534  /// the other is guaranteed to execute. This is determined using dominators
535  /// and post-dominators: if A dominates B and B post-dominates A then A and B
536  /// are control-flow equivalent.
537  bool isControlFlowEquivalent(const FusionCandidate &FC0,
538  const FusionCandidate &FC1) const {
539  assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
540 
541  if (DT.dominates(FC0.Preheader, FC1.Preheader))
542  return PDT.dominates(FC1.Preheader, FC0.Preheader);
543 
544  if (DT.dominates(FC1.Preheader, FC0.Preheader))
545  return PDT.dominates(FC0.Preheader, FC1.Preheader);
546 
547  return false;
548  }
549 
550  /// Iterate over all loops in the given loop set and identify the loops that
551  /// are eligible for fusion. Place all eligible fusion candidates into Control
552  /// Flow Equivalent sets, sorted by dominance.
553  void collectFusionCandidates(const LoopVector &LV) {
554  for (Loop *L : LV) {
555  FusionCandidate CurrCand(L, &DT, &PDT, ORE);
556  if (!CurrCand.isEligibleForFusion(SE))
557  continue;
558 
559  // Go through each list in FusionCandidates and determine if L is control
560  // flow equivalent with the first loop in that list. If it is, append LV.
561  // If not, go to the next list.
562  // If no suitable list is found, start another list and add it to
563  // FusionCandidates.
564  bool FoundSet = false;
565 
566  for (auto &CurrCandSet : FusionCandidates) {
567  if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
568  CurrCandSet.insert(CurrCand);
569  FoundSet = true;
570 #ifndef NDEBUG
572  LLVM_DEBUG(dbgs() << "Adding " << CurrCand
573  << " to existing candidate set\n");
574 #endif
575  break;
576  }
577  }
578  if (!FoundSet) {
579  // No set was found. Create a new set and add to FusionCandidates
580 #ifndef NDEBUG
582  LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
583 #endif
584  FusionCandidateSet NewCandSet;
585  NewCandSet.insert(CurrCand);
586  FusionCandidates.push_back(NewCandSet);
587  }
588  NumFusionCandidates++;
589  }
590  }
591 
592  /// Determine if it is beneficial to fuse two loops.
593  ///
594  /// For now, this method simply returns true because we want to fuse as much
595  /// as possible (primarily to test the pass). This method will evolve, over
596  /// time, to add heuristics for profitability of fusion.
597  bool isBeneficialFusion(const FusionCandidate &FC0,
598  const FusionCandidate &FC1) {
599  return true;
600  }
601 
602  /// Determine if two fusion candidates have the same trip count (i.e., they
603  /// execute the same number of iterations).
604  ///
605  /// Note that for now this method simply returns a boolean value because there
606  /// are no mechanisms in loop fusion to handle different trip counts. In the
607  /// future, this behaviour can be extended to adjust one of the loops to make
608  /// the trip counts equal (e.g., loop peeling). When this is added, this
609  /// interface may need to change to return more information than just a
610  /// boolean value.
611  bool identicalTripCounts(const FusionCandidate &FC0,
612  const FusionCandidate &FC1) const {
613  const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
614  if (isa<SCEVCouldNotCompute>(TripCount0)) {
615  UncomputableTripCount++;
616  LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
617  return false;
618  }
619 
620  const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
621  if (isa<SCEVCouldNotCompute>(TripCount1)) {
622  UncomputableTripCount++;
623  LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
624  return false;
625  }
626  LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
627  << *TripCount1 << " are "
628  << (TripCount0 == TripCount1 ? "identical" : "different")
629  << "\n");
630 
631  return (TripCount0 == TripCount1);
632  }
633 
634  /// Walk each set of control flow equivalent fusion candidates and attempt to
635  /// fuse them. This does a single linear traversal of all candidates in the
636  /// set. The conditions for legal fusion are checked at this point. If a pair
637  /// of fusion candidates passes all legality checks, they are fused together
638  /// and a new fusion candidate is created and added to the FusionCandidateSet.
639  /// The original fusion candidates are then removed, as they are no longer
640  /// valid.
641  bool fuseCandidates() {
642  bool Fused = false;
643  LLVM_DEBUG(printFusionCandidates(FusionCandidates));
644  for (auto &CandidateSet : FusionCandidates) {
645  if (CandidateSet.size() < 2)
646  continue;
647 
648  LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
649  << CandidateSet << "\n");
650 
651  for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
652  assert(!LDT.isRemovedLoop(FC0->L) &&
653  "Should not have removed loops in CandidateSet!");
654  auto FC1 = FC0;
655  for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
656  assert(!LDT.isRemovedLoop(FC1->L) &&
657  "Should not have removed loops in CandidateSet!");
658 
659  LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
660  dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
661 
662  FC0->verify();
663  FC1->verify();
664 
665  if (!identicalTripCounts(*FC0, *FC1)) {
666  LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
667  "counts. Not fusing.\n");
668  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
669  NonEqualTripCount);
670  continue;
671  }
672 
673  if (!isAdjacent(*FC0, *FC1)) {
674  LLVM_DEBUG(dbgs()
675  << "Fusion candidates are not adjacent. Not fusing.\n");
676  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
677  continue;
678  }
679 
680  // For now we skip fusing if the second candidate has any instructions
681  // in the preheader. This is done because we currently do not have the
682  // safety checks to determine if it is save to move the preheader of
683  // the second candidate past the body of the first candidate. Once
684  // these checks are added, this condition can be removed.
685  if (!isEmptyPreheader(*FC1)) {
686  LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
687  "preheader. Not fusing.\n");
688  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
689  NonEmptyPreheader);
690  continue;
691  }
692 
693  if (!dependencesAllowFusion(*FC0, *FC1)) {
694  LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
695  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
696  InvalidDependencies);
697  continue;
698  }
699 
700  bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
701  LLVM_DEBUG(dbgs()
702  << "\tFusion appears to be "
703  << (BeneficialToFuse ? "" : "un") << "profitable!\n");
704  if (!BeneficialToFuse) {
705  reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
706  FusionNotBeneficial);
707  continue;
708  }
709  // All analysis has completed and has determined that fusion is legal
710  // and profitable. At this point, start transforming the code and
711  // perform fusion.
712 
713  LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
714  << *FC1 << "\n");
715 
716  // Report fusion to the Optimization Remarks.
717  // Note this needs to be done *before* performFusion because
718  // performFusion will change the original loops, making it not
719  // possible to identify them after fusion is complete.
720  reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
721 
722  FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
723  FusedCand.verify();
724  assert(FusedCand.isEligibleForFusion(SE) &&
725  "Fused candidate should be eligible for fusion!");
726 
727  // Notify the loop-depth-tree that these loops are not valid objects
728  LDT.removeLoop(FC1->L);
729 
730  CandidateSet.erase(FC0);
731  CandidateSet.erase(FC1);
732 
733  auto InsertPos = CandidateSet.insert(FusedCand);
734 
735  assert(InsertPos.second &&
736  "Unable to insert TargetCandidate in CandidateSet!");
737 
738  // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
739  // of the FC1 loop will attempt to fuse the new (fused) loop with the
740  // remaining candidates in the current candidate set.
741  FC0 = FC1 = InsertPos.first;
742 
743  LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
744  << "\n");
745 
746  Fused = true;
747  }
748  }
749  }
750  return Fused;
751  }
752 
753  /// Rewrite all additive recurrences in a SCEV to use a new loop.
754  class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
755  public:
756  AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
757  bool UseMax = true)
758  : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
759  NewL(NewL) {}
760 
761  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
762  const Loop *ExprL = Expr->getLoop();
764  if (ExprL == &OldL) {
765  Operands.append(Expr->op_begin(), Expr->op_end());
766  return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
767  }
768 
769  if (OldL.contains(ExprL)) {
770  bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
771  if (!UseMax || !Pos || !Expr->isAffine()) {
772  Valid = false;
773  return Expr;
774  }
775  return visit(Expr->getStart());
776  }
777 
778  for (const SCEV *Op : Expr->operands())
779  Operands.push_back(visit(Op));
780  return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
781  }
782 
783  bool wasValidSCEV() const { return Valid; }
784 
785  private:
786  bool Valid, UseMax;
787  const Loop &OldL, &NewL;
788  };
789 
790  /// Return false if the access functions of \p I0 and \p I1 could cause
791  /// a negative dependence.
792  bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
793  Instruction &I1, bool EqualIsInvalid) {
794  Value *Ptr0 = getLoadStorePointerOperand(&I0);
795  Value *Ptr1 = getLoadStorePointerOperand(&I1);
796  if (!Ptr0 || !Ptr1)
797  return false;
798 
799  const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
800  const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
801 #ifndef NDEBUG
803  LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
804  << *SCEVPtr1 << "\n");
805 #endif
806  AddRecLoopReplacer Rewriter(SE, L0, L1);
807  SCEVPtr0 = Rewriter.visit(SCEVPtr0);
808 #ifndef NDEBUG
810  LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
811  << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
812 #endif
813  if (!Rewriter.wasValidSCEV())
814  return false;
815 
816  // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
817  // L0) and the other is not. We could check if it is monotone and test
818  // the beginning and end value instead.
819 
820  BasicBlock *L0Header = L0.getHeader();
821  auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
822  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
823  if (!AddRec)
824  return false;
825  return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
826  !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
827  };
828  if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
829  return false;
830 
831  ICmpInst::Predicate Pred =
832  EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
833  bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
834 #ifndef NDEBUG
836  LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
837  << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
838  << "\n");
839 #endif
840  return IsAlwaysGE;
841  }
842 
843  /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
844  /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
845  /// specified by @p DepChoice are used to determine this.
846  bool dependencesAllowFusion(const FusionCandidate &FC0,
847  const FusionCandidate &FC1, Instruction &I0,
848  Instruction &I1, bool AnyDep,
849  FusionDependenceAnalysisChoice DepChoice) {
850 #ifndef NDEBUG
852  LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
853  << DepChoice << "\n");
854  }
855 #endif
856  switch (DepChoice) {
858  return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
860  auto DepResult = DI.depends(&I0, &I1, true);
861  if (!DepResult)
862  return true;
863 #ifndef NDEBUG
865  LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
866  dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
867  << (DepResult->isOrdered() ? "true" : "false")
868  << "]\n");
869  LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
870  << "\n");
871  }
872 #endif
873 
874  if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
875  LLVM_DEBUG(
876  dbgs() << "TODO: Implement pred/succ dependence handling!\n");
877 
878  // TODO: Can we actually use the dependence info analysis here?
879  return false;
880  }
881 
883  return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
885  dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
887  }
888 
889  llvm_unreachable("Unknown fusion dependence analysis choice!");
890  }
891 
892  /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
893  bool dependencesAllowFusion(const FusionCandidate &FC0,
894  const FusionCandidate &FC1) {
895  LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
896  << "\n");
897  assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
898  assert(DT.dominates(FC0.Preheader, FC1.Preheader));
899 
900  for (Instruction *WriteL0 : FC0.MemWrites) {
901  for (Instruction *WriteL1 : FC1.MemWrites)
902  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
903  /* AnyDep */ false,
905  InvalidDependencies++;
906  return false;
907  }
908  for (Instruction *ReadL1 : FC1.MemReads)
909  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
910  /* AnyDep */ false,
912  InvalidDependencies++;
913  return false;
914  }
915  }
916 
917  for (Instruction *WriteL1 : FC1.MemWrites) {
918  for (Instruction *WriteL0 : FC0.MemWrites)
919  if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
920  /* AnyDep */ false,
922  InvalidDependencies++;
923  return false;
924  }
925  for (Instruction *ReadL0 : FC0.MemReads)
926  if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
927  /* AnyDep */ false,
929  InvalidDependencies++;
930  return false;
931  }
932  }
933 
934  // Walk through all uses in FC1. For each use, find the reaching def. If the
935  // def is located in FC0 then it is is not safe to fuse.
936  for (BasicBlock *BB : FC1.L->blocks())
937  for (Instruction &I : *BB)
938  for (auto &Op : I.operands())
939  if (Instruction *Def = dyn_cast<Instruction>(Op))
940  if (FC0.L->contains(Def->getParent())) {
941  InvalidDependencies++;
942  return false;
943  }
944 
945  return true;
946  }
947 
948  /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this
949  /// case, there is no code in between the two fusion candidates, thus making
950  /// them adjacent.
951  bool isAdjacent(const FusionCandidate &FC0,
952  const FusionCandidate &FC1) const {
953  return FC0.ExitBlock == FC1.Preheader;
954  }
955 
956  bool isEmptyPreheader(const FusionCandidate &FC) const {
957  return FC.Preheader->size() == 1;
958  }
959 
960  /// Fuse two fusion candidates, creating a new fused loop.
961  ///
962  /// This method contains the mechanics of fusing two loops, represented by \p
963  /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
964  /// postdominates \p FC0 (making them control flow equivalent). It also
965  /// assumes that the other conditions for fusion have been met: adjacent,
966  /// identical trip counts, and no negative distance dependencies exist that
967  /// would prevent fusion. Thus, there is no checking for these conditions in
968  /// this method.
969  ///
970  /// Fusion is performed by rewiring the CFG to update successor blocks of the
971  /// components of tho loop. Specifically, the following changes are done:
972  ///
973  /// 1. The preheader of \p FC1 is removed as it is no longer necessary
974  /// (because it is currently only a single statement block).
975  /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
976  /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.
977  /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
978  ///
979  /// All of these modifications are done with dominator tree updates, thus
980  /// keeping the dominator (and post dominator) information up-to-date.
981  ///
982  /// This can be improved in the future by actually merging blocks during
983  /// fusion. For example, the preheader of \p FC1 can be merged with the
984  /// preheader of \p FC0. This would allow loops with more than a single
985  /// statement in the preheader to be fused. Similarly, the latch blocks of the
986  /// two loops could also be fused into a single block. This will require
987  /// analysis to prove it is safe to move the contents of the block past
988  /// existing code, which currently has not been implemented.
989  Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
990  assert(FC0.isValid() && FC1.isValid() &&
991  "Expecting valid fusion candidates");
992 
993  LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
994  dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
995 
996  assert(FC1.Preheader == FC0.ExitBlock);
997  assert(FC1.Preheader->size() == 1 &&
998  FC1.Preheader->getSingleSuccessor() == FC1.Header);
999 
1000  // Remember the phi nodes originally in the header of FC0 in order to rewire
1001  // them later. However, this is only necessary if the new loop carried
1002  // values might not dominate the exiting branch. While we do not generally
1003  // test if this is the case but simply insert intermediate phi nodes, we
1004  // need to make sure these intermediate phi nodes have different
1005  // predecessors. To this end, we filter the special case where the exiting
1006  // block is the latch block of the first loop. Nothing needs to be done
1007  // anyway as all loop carried values dominate the latch and thereby also the
1008  // exiting branch.
1009  SmallVector<PHINode *, 8> OriginalFC0PHIs;
1010  if (FC0.ExitingBlock != FC0.Latch)
1011  for (PHINode &PHI : FC0.Header->phis())
1012  OriginalFC0PHIs.push_back(&PHI);
1013 
1014  // Replace incoming blocks for header PHIs first.
1015  FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
1016  FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
1017 
1018  // Then modify the control flow and update DT and PDT.
1020 
1021  // The old exiting block of the first loop (FC0) has to jump to the header
1022  // of the second as we need to execute the code in the second header block
1023  // regardless of the trip count. That is, if the trip count is 0, so the
1024  // back edge is never taken, we still have to execute both loop headers,
1025  // especially (but not only!) if the second is a do-while style loop.
1026  // However, doing so might invalidate the phi nodes of the first loop as
1027  // the new values do only need to dominate their latch and not the exiting
1028  // predicate. To remedy this potential problem we always introduce phi
1029  // nodes in the header of the second loop later that select the loop carried
1030  // value, if the second header was reached through an old latch of the
1031  // first, or undef otherwise. This is sound as exiting the first implies the
1032  // second will exit too, __without__ taking the back-edge. [Their
1033  // trip-counts are equal after all.
1034  // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go
1035  // to FC1.Header? I think this is basically what the three sequences are
1036  // trying to accomplish; however, doing this directly in the CFG may mean
1037  // the DT/PDT becomes invalid
1038  FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
1039  FC1.Header);
1041  DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
1043  DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1044 
1045  // The pre-header of L1 is not necessary anymore.
1046  assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
1047  FC1.Preheader->getTerminator()->eraseFromParent();
1048  new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
1050  DominatorTree::Delete, FC1.Preheader, FC1.Header));
1051 
1052  // Moves the phi nodes from the second to the first loops header block.
1053  while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
1054  if (SE.isSCEVable(PHI->getType()))
1055  SE.forgetValue(PHI);
1056  if (PHI->hasNUsesOrMore(1))
1057  PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
1058  else
1059  PHI->eraseFromParent();
1060  }
1061 
1062  // Introduce new phi nodes in the second loop header to ensure
1063  // exiting the first and jumping to the header of the second does not break
1064  // the SSA property of the phis originally in the first loop. See also the
1065  // comment above.
1066  Instruction *L1HeaderIP = &FC1.Header->front();
1067  for (PHINode *LCPHI : OriginalFC0PHIs) {
1068  int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1069  assert(L1LatchBBIdx >= 0 &&
1070  "Expected loop carried value to be rewired at this point!");
1071 
1072  Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1073 
1074  PHINode *L1HeaderPHI = PHINode::Create(
1075  LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
1076  L1HeaderPHI->addIncoming(LCV, FC0.Latch);
1077  L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
1078  FC0.ExitingBlock);
1079 
1080  LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1081  }
1082 
1083  // Replace latch terminator destinations.
1084  FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
1085  FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
1086 
1087  // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1088  // performed the updates above.
1089  if (FC0.Latch != FC0.ExitingBlock)
1091  DominatorTree::Insert, FC0.Latch, FC1.Header));
1092 
1094  FC0.Latch, FC0.Header));
1096  FC1.Latch, FC0.Header));
1098  FC1.Latch, FC1.Header));
1099 
1100  // Update DT/PDT
1101  DTU.applyUpdates(TreeUpdates);
1102 
1103  LI.removeBlock(FC1.Preheader);
1104  DTU.deleteBB(FC1.Preheader);
1105  DTU.flush();
1106 
1107  // Is there a way to keep SE up-to-date so we don't need to forget the loops
1108  // and rebuild the information in subsequent passes of fusion?
1109  SE.forgetLoop(FC1.L);
1110  SE.forgetLoop(FC0.L);
1111 
1112  // Merge the loops.
1113  SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
1114  FC1.L->block_end());
1115  for (BasicBlock *BB : Blocks) {
1116  FC0.L->addBlockEntry(BB);
1117  FC1.L->removeBlockFromLoop(BB);
1118  if (LI.getLoopFor(BB) != FC1.L)
1119  continue;
1120  LI.changeLoopFor(BB, FC0.L);
1121  }
1122  while (!FC1.L->empty()) {
1123  const auto &ChildLoopIt = FC1.L->begin();
1124  Loop *ChildLoop = *ChildLoopIt;
1125  FC1.L->removeChildLoop(ChildLoopIt);
1126  FC0.L->addChildLoop(ChildLoop);
1127  }
1128 
1129  // Delete the now empty loop L1.
1130  LI.erase(FC1.L);
1131 
1132 #ifndef NDEBUG
1133  assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1135  assert(PDT.verify());
1136  LI.verify(DT);
1137  SE.verify();
1138 #endif
1139 
1140  FuseCounter++;
1141 
1142  LLVM_DEBUG(dbgs() << "Fusion done:\n");
1143 
1144  return FC0.L;
1145  }
1146 
1147  /// Report details on loop fusion opportunities.
1148  ///
1149  /// This template function can be used to report both successful and missed
1150  /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1151  /// be one of:
1152  /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1153  /// given two valid fusion candidates.
1154  /// - OptimizationRemark to report successful fusion of two fusion
1155  /// candidates.
1156  /// The remarks will be printed using the form:
1157  /// <path/filename>:<line number>:<column number>: [<function name>]:
1158  /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1159  template <typename RemarkKind>
1160  void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1161  llvm::Statistic &Stat) {
1162  assert(FC0.Preheader && FC1.Preheader &&
1163  "Expecting valid fusion candidates");
1164  using namespace ore;
1165  ++Stat;
1166  ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1167  FC0.Preheader)
1168  << "[" << FC0.Preheader->getParent()->getName()
1169  << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1170  << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1171  << ": " << Stat.getDesc());
1172  }
1173 };
1174 
1175 struct LoopFuseLegacy : public FunctionPass {
1176 
1177  static char ID;
1178 
1179  LoopFuseLegacy() : FunctionPass(ID) {
1181  }
1182 
1183  void getAnalysisUsage(AnalysisUsage &AU) const override {
1191 
1196  }
1197 
1198  bool runOnFunction(Function &F) override {
1199  if (skipFunction(F))
1200  return false;
1201  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1202  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1203  auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1204  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1205  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
1206  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1207 
1208  const DataLayout &DL = F.getParent()->getDataLayout();
1209  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
1210  return LF.fuseLoops(F);
1211  }
1212 };
1213 } // namespace
1214 
1216  auto &LI = AM.getResult<LoopAnalysis>(F);
1217  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1218  auto &DI = AM.getResult<DependenceAnalysis>(F);
1219  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1220  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
1222 
1223  const DataLayout &DL = F.getParent()->getDataLayout();
1224  LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
1225  bool Changed = LF.fuseLoops(F);
1226  if (!Changed)
1227  return PreservedAnalyses::all();
1228 
1229  PreservedAnalyses PA;
1233  PA.preserve<LoopAnalysis>();
1234  return PA;
1235 }
1236 
1237 char LoopFuseLegacy::ID = 0;
1238 
1239 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
1240  false)
1247 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
1248 
1249 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
void initializeLoopFuseLegacyPass(PassRegistry &)
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
raw_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:211
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
bool empty() const
Definition: LoopInfo.h:907
DiagnosticInfoOptimizationBase::Argument NV
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:776
AnalysisPass to compute dependence information in a function.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:476
Legacy pass manager pass to access dependence information.
The main scalar evolution driver.
This file implements the Loop Fusion pass.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:160
STATISTIC(NumFunctions, "Total number of functions")
void flush()
Apply all pending updates to available trees and flush all BasicBlocks awaiting deletion.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:230
F(f)
block Block Frequency true
An instruction for reading from memory.
Definition: Instructions.h:167
INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) FunctionPass *llvm
Definition: LoopFuse.cpp:1239
DependenceInfo - This class is the main dependence-analysis driver.
bool isInvalid() const
Return true if this loop is no longer valid.
Definition: LoopInfo.h:198
void verify(const DominatorTreeBase< BlockT, false > &DomTree) const
Definition: LoopInfoImpl.h:683
static cl::opt< FusionDependenceAnalysisChoice > FusionDependenceAnalysis("loop-fusion-dependence-analysis", cl::desc("Which dependence analysis should loop fusion use?"), cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", "Use the scalar evolution interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", "Use the dependence analysis interface"), clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", "Use all available analyses")), cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore)
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4428
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:369
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:928
Diagnostic information for optimization analysis remarks.
static cl::opt< bool > VerboseFusionDebugging("loop-fusion-verbose-debug", cl::desc("Enable verbose debugging for Loop Fusion"), cl::Hidden, cl::init(false), cl::ZeroOrMore)
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:853
friend const_iterator begin(StringRef path, Style style)
Get begin iterator over path.
Definition: Path.cpp:224
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:1183
mir Rename Register Operands
BlockT * getHeader() const
Definition: LoopInfo.h:105
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS...
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
This node represents a polynomial recurrence on the trip count of the specified loop.
An instruction for storing to memory.
Definition: Instructions.h:320
op_iterator op_begin() const
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
static bool runOnFunction(Function &F, bool PostInlining)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:153
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
void deleteBB(BasicBlock *DelBB)
Delete DelBB.
This function has undefined behavior.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:652
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:112
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopFuse.cpp:1215
const SCEV * getSCEVAtScope(const SCEV *S, const Loop *L)
Return a SCEV expression for the specified value at the specified scope in the program.
Represent the analysis usage information of a pass.
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:42
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:732
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:115
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:75
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1446
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:159
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:607
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool verify(const TargetRegisterInfo &TRI) const
Check that information hold by this instance make sense for the given TRI.
signed greater than
Definition: InstrTypes.h:759
char & LoopSimplifyID
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:197
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void print(raw_ostream &OS) const
Definition: LoopInfoImpl.h:612
auto size(R &&Range, typename std::enable_if< std::is_same< typename std::iterator_traits< decltype(Range.begin())>::iterator_category, std::random_access_iterator_tag >::value, void >::type *=nullptr) -> decltype(std::distance(Range.begin(), Range.end()))
Get the size of a range.
Definition: STLExtras.h:1146
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
void applyUpdates(ArrayRef< DominatorTree::UpdateType > Updates)
Submit updates to all available trees.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:314
reverse_iterator rend() const
Definition: LoopInfo.h:906
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
Analysis pass which computes a PostDominatorTree.
static unsigned getDepth(const SmallVectorImpl< const MachineBasicBlock *> &Stack, const MachineBasicBlock *MBB)
iterator begin() const
Definition: LoopInfo.h:147
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
#define DEBUG_TYPE
Definition: LoopFuse.cpp:67
const char * getDesc() const
Definition: Statistic.h:58
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:387
reverse_iterator rbegin() const
Definition: LoopInfo.h:905
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:627
FunctionPass * createLoopFusePass()
Analysis pass that exposes the ScalarEvolution for a function.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:220
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:460
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
This class represents an analyzed expression in the program.
StringRef getName() const
Definition: LoopInfo.h:827
const char * getName() const
Definition: Statistic.h:57
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:5054
iterator end() const
Definition: LoopInfo.h:148
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
Definition: LoopInfo.h:960
void preserve()
Mark an analysis as preserved.
Definition: PassManager.h:174
FusionDependenceAnalysisChoice
Definition: LoopFuse.cpp:89
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:2045
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
LLVM Value Representation.
Definition: Value.h:73
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
OptimizationRemarkEmitter legacy analysis pass.
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
The legacy pass manager&#39;s analysis pass to compute loop information.
Definition: LoopInfo.h:1208
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
A container for analyses that lazily runs them and caches their results.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:259
#define LLVM_DEBUG(X)
Definition: Debug.h:122
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:161
void printLoop(Loop &L, raw_ostream &OS, const std::string &Banner="")
Function to print a loop&#39;s contents as LLVM&#39;s text IR assembly.
Definition: LoopInfo.cpp:932
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
The optimization diagnostic interface.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
Definition: LoopInfo.h:987
This visitor recursively visits a SCEV expression and re-writes it.
signed greater or equal
Definition: InstrTypes.h:760