LLVM  6.0.0svn
LoopInterchange.cpp
Go to the documentation of this file.
1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This Pass handles loop interchange transform.
11 // This pass interchanges loops to provide a more cache-friendly memory access
12 // patterns.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/StringRef.h"
21 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/InstrTypes.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/IR/User.h"
35 #include "llvm/IR/Value.h"
36 #include "llvm/Pass.h"
37 #include "llvm/Support/Casting.h"
39 #include "llvm/Support/Debug.h"
42 #include "llvm/Transforms/Scalar.h"
45 #include <cassert>
46 #include <utility>
47 #include <vector>
48 
49 using namespace llvm;
50 
51 #define DEBUG_TYPE "loop-interchange"
52 
54  "loop-interchange-threshold", cl::init(0), cl::Hidden,
55  cl::desc("Interchange if you gain more than this number"));
56 
57 namespace {
58 
59 using LoopVector = SmallVector<Loop *, 8>;
60 
61 // TODO: Check if we can use a sparse matrix here.
62 using CharMatrix = std::vector<std::vector<char>>;
63 
64 } // end anonymous namespace
65 
66 // Maximum number of dependencies that can be handled in the dependency matrix.
67 static const unsigned MaxMemInstrCount = 100;
68 
69 // Maximum loop depth supported.
70 static const unsigned MaxLoopNestDepth = 10;
71 
72 #ifdef DUMP_DEP_MATRICIES
73 static void printDepMatrix(CharMatrix &DepMatrix) {
74  for (auto &Row : DepMatrix) {
75  for (auto D : Row)
76  DEBUG(dbgs() << D << " ");
77  DEBUG(dbgs() << "\n");
78  }
79 }
80 #endif
81 
82 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
83  Loop *L, DependenceInfo *DI) {
84  using ValueVector = SmallVector<Value *, 16>;
85 
86  ValueVector MemInstr;
87 
88  // For each block.
89  for (BasicBlock *BB : L->blocks()) {
90  // Scan the BB and collect legal loads and stores.
91  for (Instruction &I : *BB) {
92  if (!isa<Instruction>(I))
93  return false;
94  if (auto *Ld = dyn_cast<LoadInst>(&I)) {
95  if (!Ld->isSimple())
96  return false;
97  MemInstr.push_back(&I);
98  } else if (auto *St = dyn_cast<StoreInst>(&I)) {
99  if (!St->isSimple())
100  return false;
101  MemInstr.push_back(&I);
102  }
103  }
104  }
105 
106  DEBUG(dbgs() << "Found " << MemInstr.size()
107  << " Loads and Stores to analyze\n");
108 
109  ValueVector::iterator I, IE, J, JE;
110 
111  for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
112  for (J = I, JE = MemInstr.end(); J != JE; ++J) {
113  std::vector<char> Dep;
114  Instruction *Src = cast<Instruction>(*I);
115  Instruction *Dst = cast<Instruction>(*J);
116  if (Src == Dst)
117  continue;
118  // Ignore Input dependencies.
119  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
120  continue;
121  // Track Output, Flow, and Anti dependencies.
122  if (auto D = DI->depends(Src, Dst, true)) {
123  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
124  DEBUG(StringRef DepType =
125  D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
126  dbgs() << "Found " << DepType
127  << " dependency between Src and Dst\n"
128  << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
129  unsigned Levels = D->getLevels();
130  char Direction;
131  for (unsigned II = 1; II <= Levels; ++II) {
132  const SCEV *Distance = D->getDistance(II);
133  const SCEVConstant *SCEVConst =
134  dyn_cast_or_null<SCEVConstant>(Distance);
135  if (SCEVConst) {
136  const ConstantInt *CI = SCEVConst->getValue();
137  if (CI->isNegative())
138  Direction = '<';
139  else if (CI->isZero())
140  Direction = '=';
141  else
142  Direction = '>';
143  Dep.push_back(Direction);
144  } else if (D->isScalar(II)) {
145  Direction = 'S';
146  Dep.push_back(Direction);
147  } else {
148  unsigned Dir = D->getDirection(II);
149  if (Dir == Dependence::DVEntry::LT ||
151  Direction = '<';
152  else if (Dir == Dependence::DVEntry::GT ||
154  Direction = '>';
155  else if (Dir == Dependence::DVEntry::EQ)
156  Direction = '=';
157  else
158  Direction = '*';
159  Dep.push_back(Direction);
160  }
161  }
162  while (Dep.size() != Level) {
163  Dep.push_back('I');
164  }
165 
166  DepMatrix.push_back(Dep);
167  if (DepMatrix.size() > MaxMemInstrCount) {
168  DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
169  << " dependencies inside loop\n");
170  return false;
171  }
172  }
173  }
174  }
175 
176  // We don't have a DepMatrix to check legality return false.
177  if (DepMatrix.empty())
178  return false;
179  return true;
180 }
181 
182 // A loop is moved from index 'from' to an index 'to'. Update the Dependence
183 // matrix by exchanging the two columns.
184 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
185  unsigned ToIndx) {
186  unsigned numRows = DepMatrix.size();
187  for (unsigned i = 0; i < numRows; ++i) {
188  char TmpVal = DepMatrix[i][ToIndx];
189  DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
190  DepMatrix[i][FromIndx] = TmpVal;
191  }
192 }
193 
194 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
195 // '>'
196 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
197  unsigned Column) {
198  for (unsigned i = 0; i <= Column; ++i) {
199  if (DepMatrix[Row][i] == '<')
200  return false;
201  if (DepMatrix[Row][i] == '>')
202  return true;
203  }
204  // All dependencies were '=','S' or 'I'
205  return false;
206 }
207 
208 // Checks if no dependence exist in the dependency matrix in Row before Column.
209 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
210  unsigned Column) {
211  for (unsigned i = 0; i < Column; ++i) {
212  if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
213  DepMatrix[Row][i] != 'I')
214  return false;
215  }
216  return true;
217 }
218 
219 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
220  unsigned OuterLoopId, char InnerDep,
221  char OuterDep) {
222  if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
223  return false;
224 
225  if (InnerDep == OuterDep)
226  return true;
227 
228  // It is legal to interchange if and only if after interchange no row has a
229  // '>' direction as the leftmost non-'='.
230 
231  if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
232  return true;
233 
234  if (InnerDep == '<')
235  return true;
236 
237  if (InnerDep == '>') {
238  // If OuterLoopId represents outermost loop then interchanging will make the
239  // 1st dependency as '>'
240  if (OuterLoopId == 0)
241  return false;
242 
243  // If all dependencies before OuterloopId are '=','S'or 'I'. Then
244  // interchanging will result in this row having an outermost non '='
245  // dependency of '>'
246  if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
247  return true;
248  }
249 
250  return false;
251 }
252 
253 // Checks if it is legal to interchange 2 loops.
254 // [Theorem] A permutation of the loops in a perfect nest is legal if and only
255 // if the direction matrix, after the same permutation is applied to its
256 // columns, has no ">" direction as the leftmost non-"=" direction in any row.
257 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
258  unsigned InnerLoopId,
259  unsigned OuterLoopId) {
260  unsigned NumRows = DepMatrix.size();
261  // For each row check if it is valid to interchange.
262  for (unsigned Row = 0; Row < NumRows; ++Row) {
263  char InnerDep = DepMatrix[Row][InnerLoopId];
264  char OuterDep = DepMatrix[Row][OuterLoopId];
265  if (InnerDep == '*' || OuterDep == '*')
266  return false;
267  if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
268  return false;
269  }
270  return true;
271 }
272 
274  DEBUG(dbgs() << "Calling populateWorklist on Func: "
275  << L.getHeader()->getParent()->getName() << " Loop: %"
276  << L.getHeader()->getName() << '\n');
277  LoopVector LoopList;
278  Loop *CurrentLoop = &L;
279  const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
280  while (!Vec->empty()) {
281  // The current loop has multiple subloops in it hence it is not tightly
282  // nested.
283  // Discard all loops above it added into Worklist.
284  if (Vec->size() != 1) {
285  LoopList.clear();
286  return;
287  }
288  LoopList.push_back(CurrentLoop);
289  CurrentLoop = Vec->front();
290  Vec = &CurrentLoop->getSubLoops();
291  }
292  LoopList.push_back(CurrentLoop);
293  V.push_back(std::move(LoopList));
294 }
295 
297  PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
298  if (InnerIndexVar)
299  return InnerIndexVar;
300  if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
301  return nullptr;
302  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
303  PHINode *PhiVar = cast<PHINode>(I);
304  Type *PhiTy = PhiVar->getType();
305  if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
306  !PhiTy->isPointerTy())
307  return nullptr;
308  const SCEVAddRecExpr *AddRec =
309  dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
310  if (!AddRec || !AddRec->isAffine())
311  continue;
312  const SCEV *Step = AddRec->getStepRecurrence(*SE);
313  if (!isa<SCEVConstant>(Step))
314  continue;
315  // Found the induction variable.
316  // FIXME: Handle loops with more than one induction variable. Note that,
317  // currently, legality makes sure we have only one induction variable.
318  return PhiVar;
319  }
320  return nullptr;
321 }
322 
323 namespace {
324 
325 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
326 class LoopInterchangeLegality {
327 public:
328  LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
329  LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
331  : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
332  PreserveLCSSA(PreserveLCSSA), ORE(ORE) {}
333 
334  /// Check if the loops can be interchanged.
335  bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
336  CharMatrix &DepMatrix);
337 
338  /// Check if the loop structure is understood. We do not handle triangular
339  /// loops for now.
340  bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
341 
342  bool currentLimitations();
343 
344  bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
345 
346 private:
347  bool tightlyNested(Loop *Outer, Loop *Inner);
348  bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
349  bool areAllUsesReductions(Instruction *Ins, Loop *L);
350  bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
351  bool findInductionAndReductions(Loop *L,
352  SmallVector<PHINode *, 8> &Inductions,
353  SmallVector<PHINode *, 8> &Reductions);
354 
355  Loop *OuterLoop;
356  Loop *InnerLoop;
357 
358  ScalarEvolution *SE;
359  LoopInfo *LI;
360  DominatorTree *DT;
361  bool PreserveLCSSA;
362 
363  /// Interface to emit optimization remarks.
365 
366  bool InnerLoopHasReduction = false;
367 };
368 
369 /// LoopInterchangeProfitability checks if it is profitable to interchange the
370 /// loop.
371 class LoopInterchangeProfitability {
372 public:
373  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
375  : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
376 
377  /// Check if the loop interchange is profitable.
378  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
379  CharMatrix &DepMatrix);
380 
381 private:
382  int getInstrOrderCost();
383 
384  Loop *OuterLoop;
385  Loop *InnerLoop;
386 
387  /// Scev analysis.
388  ScalarEvolution *SE;
389 
390  /// Interface to emit optimization remarks.
392 };
393 
394 /// LoopInterchangeTransform interchanges the loop.
395 class LoopInterchangeTransform {
396 public:
397  LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
398  LoopInfo *LI, DominatorTree *DT,
399  BasicBlock *LoopNestExit,
400  bool InnerLoopContainsReductions)
401  : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
402  LoopExit(LoopNestExit),
403  InnerLoopHasReduction(InnerLoopContainsReductions) {}
404 
405  /// Interchange OuterLoop and InnerLoop.
406  bool transform();
407  void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
408  void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
409 
410 private:
411  void splitInnerLoopLatch(Instruction *);
412  void splitInnerLoopHeader();
413  bool adjustLoopLinks();
414  void adjustLoopPreheaders();
415  bool adjustLoopBranches();
416  void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
417  BasicBlock *NewPred);
418 
419  Loop *OuterLoop;
420  Loop *InnerLoop;
421 
422  /// Scev analysis.
423  ScalarEvolution *SE;
424 
425  LoopInfo *LI;
426  DominatorTree *DT;
427  BasicBlock *LoopExit;
428  bool InnerLoopHasReduction;
429 };
430 
431 // Main LoopInterchange Pass.
432 struct LoopInterchange : public FunctionPass {
433  static char ID;
434  ScalarEvolution *SE = nullptr;
435  LoopInfo *LI = nullptr;
436  DependenceInfo *DI = nullptr;
437  DominatorTree *DT = nullptr;
438  bool PreserveLCSSA;
439 
440  /// Interface to emit optimization remarks.
442 
443  LoopInterchange() : FunctionPass(ID) {
445  }
446 
447  void getAnalysisUsage(AnalysisUsage &AU) const override {
456  }
457 
458  bool runOnFunction(Function &F) override {
459  if (skipFunction(F))
460  return false;
461 
462  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
463  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
464  DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
465  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
466  DT = DTWP ? &DTWP->getDomTree() : nullptr;
467  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
468  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
469 
470  // Build up a worklist of loop pairs to analyze.
472 
473  for (Loop *L : *LI)
474  populateWorklist(*L, Worklist);
475 
476  DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
477  bool Changed = true;
478  while (!Worklist.empty()) {
479  LoopVector LoopList = Worklist.pop_back_val();
480  Changed = processLoopList(LoopList, F);
481  }
482  return Changed;
483  }
484 
485  bool isComputableLoopNest(LoopVector LoopList) {
486  for (Loop *L : LoopList) {
487  const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
488  if (ExitCountOuter == SE->getCouldNotCompute()) {
489  DEBUG(dbgs() << "Couldn't compute backedge count\n");
490  return false;
491  }
492  if (L->getNumBackEdges() != 1) {
493  DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
494  return false;
495  }
496  if (!L->getExitingBlock()) {
497  DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
498  return false;
499  }
500  }
501  return true;
502  }
503 
504  unsigned selectLoopForInterchange(const LoopVector &LoopList) {
505  // TODO: Add a better heuristic to select the loop to be interchanged based
506  // on the dependence matrix. Currently we select the innermost loop.
507  return LoopList.size() - 1;
508  }
509 
510  bool processLoopList(LoopVector LoopList, Function &F) {
511  bool Changed = false;
512  unsigned LoopNestDepth = LoopList.size();
513  if (LoopNestDepth < 2) {
514  DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
515  return false;
516  }
517  if (LoopNestDepth > MaxLoopNestDepth) {
518  DEBUG(dbgs() << "Cannot handle loops of depth greater than "
519  << MaxLoopNestDepth << "\n");
520  return false;
521  }
522  if (!isComputableLoopNest(LoopList)) {
523  DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
524  return false;
525  }
526 
527  DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
528 
529  CharMatrix DependencyMatrix;
530  Loop *OuterMostLoop = *(LoopList.begin());
531  if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
532  OuterMostLoop, DI)) {
533  DEBUG(dbgs() << "Populating dependency matrix failed\n");
534  return false;
535  }
536 #ifdef DUMP_DEP_MATRICIES
537  DEBUG(dbgs() << "Dependence before interchange\n");
538  printDepMatrix(DependencyMatrix);
539 #endif
540 
541  BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
542  BranchInst *OuterMostLoopLatchBI =
543  dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
544  if (!OuterMostLoopLatchBI)
545  return false;
546 
547  // Since we currently do not handle LCSSA PHI's any failure in loop
548  // condition will now branch to LoopNestExit.
549  // TODO: This should be removed once we handle LCSSA PHI nodes.
550 
551  // Get the Outermost loop exit.
552  BasicBlock *LoopNestExit;
553  if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
554  LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
555  else
556  LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
557 
558  if (isa<PHINode>(LoopNestExit->begin())) {
559  DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
560  "since on failure all loops branch to loop nest exit.\n");
561  return false;
562  }
563 
564  unsigned SelecLoopId = selectLoopForInterchange(LoopList);
565  // Move the selected loop outwards to the best possible position.
566  for (unsigned i = SelecLoopId; i > 0; i--) {
567  bool Interchanged =
568  processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
569  if (!Interchanged)
570  return Changed;
571  // Loops interchanged reflect the same in LoopList
572  std::swap(LoopList[i - 1], LoopList[i]);
573 
574  // Update the DependencyMatrix
575  interChangeDependencies(DependencyMatrix, i, i - 1);
576  DT->recalculate(F);
577 #ifdef DUMP_DEP_MATRICIES
578  DEBUG(dbgs() << "Dependence after interchange\n");
579  printDepMatrix(DependencyMatrix);
580 #endif
581  Changed |= Interchanged;
582  }
583  return Changed;
584  }
585 
586  bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
587  unsigned OuterLoopId, BasicBlock *LoopNestExit,
588  std::vector<std::vector<char>> &DependencyMatrix) {
589  DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
590  << " and OuterLoopId = " << OuterLoopId << "\n");
591  Loop *InnerLoop = LoopList[InnerLoopId];
592  Loop *OuterLoop = LoopList[OuterLoopId];
593 
594  LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
595  PreserveLCSSA, ORE);
596  if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
597  DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
598  return false;
599  }
600  DEBUG(dbgs() << "Loops are legal to interchange\n");
601  LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
602  if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
603  DEBUG(dbgs() << "Interchanging loops not profitable\n");
604  return false;
605  }
606 
607  ORE->emit([&]() {
608  return OptimizationRemark(DEBUG_TYPE, "Interchanged",
609  InnerLoop->getStartLoc(),
610  InnerLoop->getHeader())
611  << "Loop interchanged with enclosing loop.";
612  });
613 
614  LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
615  LoopNestExit, LIL.hasInnerLoopReduction());
616  LIT.transform();
617  DEBUG(dbgs() << "Loops interchanged\n");
618  return true;
619  }
620 };
621 
622 } // end anonymous namespace
623 
624 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
625  return llvm::none_of(Ins->users(), [=](User *U) -> bool {
626  auto *UserIns = dyn_cast<PHINode>(U);
628  return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
629  });
630 }
631 
632 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
633  BasicBlock *BB) {
634  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
635  // Load corresponding to reduction PHI's are safe while concluding if
636  // tightly nested.
637  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
638  if (!areAllUsesReductions(L, InnerLoop))
639  return true;
640  } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
641  return true;
642  }
643  return false;
644 }
645 
646 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
647  BasicBlock *BB) {
648  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
649  // Stores corresponding to reductions are safe while concluding if tightly
650  // nested.
651  if (StoreInst *L = dyn_cast<StoreInst>(I)) {
652  if (!isa<PHINode>(L->getOperand(0)))
653  return true;
654  } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
655  return true;
656  }
657  return false;
658 }
659 
660 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
661  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
662  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
663  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
664 
665  DEBUG(dbgs() << "Checking if loops are tightly nested\n");
666 
667  // A perfectly nested loop will not have any branch in between the outer and
668  // inner block i.e. outer header will branch to either inner preheader and
669  // outerloop latch.
670  BranchInst *OuterLoopHeaderBI =
671  dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
672  if (!OuterLoopHeaderBI)
673  return false;
674 
675  for (BasicBlock *Succ : OuterLoopHeaderBI->successors())
676  if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch)
677  return false;
678 
679  DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
680  // We do not have any basic block in between now make sure the outer header
681  // and outer loop latch doesn't contain any unsafe instructions.
682  if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
683  containsUnsafeInstructionsInLatch(OuterLoopLatch))
684  return false;
685 
686  DEBUG(dbgs() << "Loops are perfectly nested\n");
687  // We have a perfect loop nest.
688  return true;
689 }
690 
691 bool LoopInterchangeLegality::isLoopStructureUnderstood(
692  PHINode *InnerInduction) {
693  unsigned Num = InnerInduction->getNumOperands();
694  BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
695  for (unsigned i = 0; i < Num; ++i) {
696  Value *Val = InnerInduction->getOperand(i);
697  if (isa<Constant>(Val))
698  continue;
700  if (!I)
701  return false;
702  // TODO: Handle triangular loops.
703  // e.g. for(int i=0;i<N;i++)
704  // for(int j=i;j<N;j++)
705  unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
706  if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
707  InnerLoopPreheader &&
708  !OuterLoop->isLoopInvariant(I)) {
709  return false;
710  }
711  }
712  return true;
713 }
714 
715 bool LoopInterchangeLegality::findInductionAndReductions(
716  Loop *L, SmallVector<PHINode *, 8> &Inductions,
717  SmallVector<PHINode *, 8> &Reductions) {
718  if (!L->getLoopLatch() || !L->getLoopPredecessor())
719  return false;
720  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
723  PHINode *PHI = cast<PHINode>(I);
724  if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
725  Inductions.push_back(PHI);
726  else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
727  Reductions.push_back(PHI);
728  else {
729  DEBUG(
730  dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
731  return false;
732  }
733  }
734  return true;
735 }
736 
737 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
738  for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
739  PHINode *PHI = cast<PHINode>(I);
740  // Reduction lcssa phi will have only 1 incoming block that from loop latch.
741  if (PHI->getNumIncomingValues() > 1)
742  return false;
744  if (!Ins)
745  return false;
746  // Incoming value for lcssa phi's in outer loop exit can only be inner loop
747  // exits lcssa phi else it would not be tightly nested.
748  if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
749  return false;
750  }
751  return true;
752 }
753 
755  BasicBlock *LoopHeader) {
756  if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
757  assert(BI->getNumSuccessors() == 2 &&
758  "Branch leaving loop latch must have 2 successors");
759  for (BasicBlock *Succ : BI->successors()) {
760  if (Succ == LoopHeader)
761  continue;
762  return Succ;
763  }
764  }
765  return nullptr;
766 }
767 
768 // This function indicates the current limitations in the transform as a result
769 // of which we do not proceed.
770 bool LoopInterchangeLegality::currentLimitations() {
771  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
772  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
773  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
774  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
775  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
776 
777  PHINode *InnerInductionVar;
778  SmallVector<PHINode *, 8> Inductions;
779  SmallVector<PHINode *, 8> Reductions;
780  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
781  DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
782  << "are supported currently.\n");
783  ORE->emit([&]() {
784  return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
785  InnerLoop->getStartLoc(),
786  InnerLoop->getHeader())
787  << "Only inner loops with induction or reduction PHI nodes can be"
788  " interchange currently.";
789  });
790  return true;
791  }
792 
793  // TODO: Currently we handle only loops with 1 induction variable.
794  if (Inductions.size() != 1) {
795  DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
796  << "Failed to interchange due to current limitation\n");
797  ORE->emit([&]() {
798  return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
799  InnerLoop->getStartLoc(),
800  InnerLoop->getHeader())
801  << "Only inner loops with 1 induction variable can be "
802  "interchanged currently.";
803  });
804  return true;
805  }
806  if (Reductions.size() > 0)
807  InnerLoopHasReduction = true;
808 
809  InnerInductionVar = Inductions.pop_back_val();
810  Reductions.clear();
811  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
812  DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
813  << "are supported currently.\n");
814  ORE->emit([&]() {
815  return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
816  OuterLoop->getStartLoc(),
817  OuterLoop->getHeader())
818  << "Only outer loops with induction or reduction PHI nodes can be"
819  " interchanged currently.";
820  });
821  return true;
822  }
823 
824  // Outer loop cannot have reduction because then loops will not be tightly
825  // nested.
826  if (!Reductions.empty()) {
827  DEBUG(dbgs() << "Outer loops with reductions are not supported "
828  << "currently.\n");
829  ORE->emit([&]() {
830  return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
831  OuterLoop->getStartLoc(),
832  OuterLoop->getHeader())
833  << "Outer loops with reductions cannot be interchangeed "
834  "currently.";
835  });
836  return true;
837  }
838  // TODO: Currently we handle only loops with 1 induction variable.
839  if (Inductions.size() != 1) {
840  DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
841  << "supported currently.\n");
842  ORE->emit([&]() {
843  return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
844  OuterLoop->getStartLoc(),
845  OuterLoop->getHeader())
846  << "Only outer loops with 1 induction variable can be "
847  "interchanged currently.";
848  });
849  return true;
850  }
851 
852  // TODO: Triangular loops are not handled for now.
853  if (!isLoopStructureUnderstood(InnerInductionVar)) {
854  DEBUG(dbgs() << "Loop structure not understood by pass\n");
855  ORE->emit([&]() {
856  return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
857  InnerLoop->getStartLoc(),
858  InnerLoop->getHeader())
859  << "Inner loop structure not understood currently.";
860  });
861  return true;
862  }
863 
864  // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
865  BasicBlock *LoopExitBlock =
866  getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
867  if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
868  DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
869  ORE->emit([&]() {
870  return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
871  OuterLoop->getStartLoc(),
872  OuterLoop->getHeader())
873  << "Only outer loops with LCSSA PHIs can be interchange "
874  "currently.";
875  });
876  return true;
877  }
878 
879  LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
880  if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
881  DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
882  ORE->emit([&]() {
883  return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
884  InnerLoop->getStartLoc(),
885  InnerLoop->getHeader())
886  << "Only inner loops with LCSSA PHIs can be interchange "
887  "currently.";
888  });
889  return true;
890  }
891 
892  // TODO: Current limitation: Since we split the inner loop latch at the point
893  // were induction variable is incremented (induction.next); We cannot have
894  // more than 1 user of induction.next since it would result in broken code
895  // after split.
896  // e.g.
897  // for(i=0;i<N;i++) {
898  // for(j = 0;j<M;j++) {
899  // A[j+1][i+2] = A[j][i]+k;
900  // }
901  // }
902  Instruction *InnerIndexVarInc = nullptr;
903  if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
904  InnerIndexVarInc =
905  dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
906  else
907  InnerIndexVarInc =
908  dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
909 
910  if (!InnerIndexVarInc) {
911  DEBUG(dbgs() << "Did not find an instruction to increment the induction "
912  << "variable.\n");
913  ORE->emit([&]() {
914  return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
915  InnerLoop->getStartLoc(),
916  InnerLoop->getHeader())
917  << "The inner loop does not increment the induction variable.";
918  });
919  return true;
920  }
921 
922  // Since we split the inner loop latch on this induction variable. Make sure
923  // we do not have any instruction between the induction variable and branch
924  // instruction.
925 
926  bool FoundInduction = false;
927  for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) {
928  if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
929  isa<ZExtInst>(I))
930  continue;
931 
932  // We found an instruction. If this is not induction variable then it is not
933  // safe to split this loop latch.
934  if (!I.isIdenticalTo(InnerIndexVarInc)) {
935  DEBUG(dbgs() << "Found unsupported instructions between induction "
936  << "variable increment and branch.\n");
937  ORE->emit([&]() {
939  DEBUG_TYPE, "UnsupportedInsBetweenInduction",
940  InnerLoop->getStartLoc(), InnerLoop->getHeader())
941  << "Found unsupported instruction between induction variable "
942  "increment and branch.";
943  });
944  return true;
945  }
946 
947  FoundInduction = true;
948  break;
949  }
950  // The loop latch ended and we didn't find the induction variable return as
951  // current limitation.
952  if (!FoundInduction) {
953  DEBUG(dbgs() << "Did not find the induction variable.\n");
954  ORE->emit([&]() {
955  return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
956  InnerLoop->getStartLoc(),
957  InnerLoop->getHeader())
958  << "Did not find the induction variable.";
959  });
960  return true;
961  }
962  return false;
963 }
964 
965 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
966  unsigned OuterLoopId,
967  CharMatrix &DepMatrix) {
968  if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
969  DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
970  << " and OuterLoopId = " << OuterLoopId
971  << " due to dependence\n");
972  ORE->emit([&]() {
973  return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
974  InnerLoop->getStartLoc(),
975  InnerLoop->getHeader())
976  << "Cannot interchange loops due to dependences.";
977  });
978  return false;
979  }
980 
981  // Check if outer and inner loop contain legal instructions only.
982  for (auto *BB : OuterLoop->blocks())
983  for (Instruction &I : *BB)
984  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
985  // readnone functions do not prevent interchanging.
986  if (CI->doesNotReadMemory())
987  continue;
988  DEBUG(dbgs() << "Loops with call instructions cannot be interchanged "
989  << "safely.");
990  return false;
991  }
992 
993  // Create unique Preheaders if we already do not have one.
994  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
995  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
996 
997  // Create a unique outer preheader -
998  // 1) If OuterLoop preheader is not present.
999  // 2) If OuterLoop Preheader is same as OuterLoop Header
1000  // 3) If OuterLoop Preheader is same as Header of the previous loop.
1001  // 4) If OuterLoop Preheader is Entry node.
1002  if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
1003  isa<PHINode>(OuterLoopPreHeader->begin()) ||
1004  !OuterLoopPreHeader->getUniquePredecessor()) {
1005  OuterLoopPreHeader =
1006  InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
1007  }
1008 
1009  if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
1010  InnerLoopPreHeader == OuterLoop->getHeader()) {
1011  InnerLoopPreHeader =
1012  InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
1013  }
1014 
1015  // TODO: The loops could not be interchanged due to current limitations in the
1016  // transform module.
1017  if (currentLimitations()) {
1018  DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1019  return false;
1020  }
1021 
1022  // Check if the loops are tightly nested.
1023  if (!tightlyNested(OuterLoop, InnerLoop)) {
1024  DEBUG(dbgs() << "Loops not tightly nested\n");
1025  ORE->emit([&]() {
1026  return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1027  InnerLoop->getStartLoc(),
1028  InnerLoop->getHeader())
1029  << "Cannot interchange loops because they are not tightly "
1030  "nested.";
1031  });
1032  return false;
1033  }
1034 
1035  return true;
1036 }
1037 
1038 int LoopInterchangeProfitability::getInstrOrderCost() {
1039  unsigned GoodOrder, BadOrder;
1040  BadOrder = GoodOrder = 0;
1041  for (BasicBlock *BB : InnerLoop->blocks()) {
1042  for (Instruction &Ins : *BB) {
1043  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1044  unsigned NumOp = GEP->getNumOperands();
1045  bool FoundInnerInduction = false;
1046  bool FoundOuterInduction = false;
1047  for (unsigned i = 0; i < NumOp; ++i) {
1048  const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1049  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1050  if (!AR)
1051  continue;
1052 
1053  // If we find the inner induction after an outer induction e.g.
1054  // for(int i=0;i<N;i++)
1055  // for(int j=0;j<N;j++)
1056  // A[i][j] = A[i-1][j-1]+k;
1057  // then it is a good order.
1058  if (AR->getLoop() == InnerLoop) {
1059  // We found an InnerLoop induction after OuterLoop induction. It is
1060  // a good order.
1061  FoundInnerInduction = true;
1062  if (FoundOuterInduction) {
1063  GoodOrder++;
1064  break;
1065  }
1066  }
1067  // If we find the outer induction after an inner induction e.g.
1068  // for(int i=0;i<N;i++)
1069  // for(int j=0;j<N;j++)
1070  // A[j][i] = A[j-1][i-1]+k;
1071  // then it is a bad order.
1072  if (AR->getLoop() == OuterLoop) {
1073  // We found an OuterLoop induction after InnerLoop induction. It is
1074  // a bad order.
1075  FoundOuterInduction = true;
1076  if (FoundInnerInduction) {
1077  BadOrder++;
1078  break;
1079  }
1080  }
1081  }
1082  }
1083  }
1084  }
1085  return GoodOrder - BadOrder;
1086 }
1087 
1088 static bool isProfitableForVectorization(unsigned InnerLoopId,
1089  unsigned OuterLoopId,
1090  CharMatrix &DepMatrix) {
1091  // TODO: Improve this heuristic to catch more cases.
1092  // If the inner loop is loop independent or doesn't carry any dependency it is
1093  // profitable to move this to outer position.
1094  for (auto &Row : DepMatrix) {
1095  if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
1096  return false;
1097  // TODO: We need to improve this heuristic.
1098  if (Row[OuterLoopId] != '=')
1099  return false;
1100  }
1101  // If outer loop has dependence and inner loop is loop independent then it is
1102  // profitable to interchange to enable parallelism.
1103  return true;
1104 }
1105 
1106 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
1107  unsigned OuterLoopId,
1108  CharMatrix &DepMatrix) {
1109  // TODO: Add better profitability checks.
1110  // e.g
1111  // 1) Construct dependency matrix and move the one with no loop carried dep
1112  // inside to enable vectorization.
1113 
1114  // This is rough cost estimation algorithm. It counts the good and bad order
1115  // of induction variables in the instruction and allows reordering if number
1116  // of bad orders is more than good.
1117  int Cost = getInstrOrderCost();
1118  DEBUG(dbgs() << "Cost = " << Cost << "\n");
1119  if (Cost < -LoopInterchangeCostThreshold)
1120  return true;
1121 
1122  // It is not profitable as per current cache profitability model. But check if
1123  // we can move this loop outside to improve parallelism.
1124  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
1125  return true;
1126 
1127  ORE->emit([&]() {
1128  return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1129  InnerLoop->getStartLoc(),
1130  InnerLoop->getHeader())
1131  << "Interchanging loops is too costly (cost="
1132  << ore::NV("Cost", Cost) << ", threshold="
1133  << ore::NV("Threshold", LoopInterchangeCostThreshold)
1134  << ") and it does not improve parallelism.";
1135  });
1136  return false;
1137 }
1138 
1139 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1140  Loop *InnerLoop) {
1141  for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
1142  ++I) {
1143  if (*I == InnerLoop) {
1144  OuterLoop->removeChildLoop(I);
1145  return;
1146  }
1147  }
1148  llvm_unreachable("Couldn't find loop");
1149 }
1150 
1151 void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
1152  Loop *OuterLoop) {
1153  Loop *OuterLoopParent = OuterLoop->getParentLoop();
1154  if (OuterLoopParent) {
1155  // Remove the loop from its parent loop.
1156  removeChildLoop(OuterLoopParent, OuterLoop);
1157  removeChildLoop(OuterLoop, InnerLoop);
1158  OuterLoopParent->addChildLoop(InnerLoop);
1159  } else {
1160  removeChildLoop(OuterLoop, InnerLoop);
1161  LI->changeTopLevelLoop(OuterLoop, InnerLoop);
1162  }
1163 
1164  while (!InnerLoop->empty())
1165  OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
1166 
1167  InnerLoop->addChildLoop(OuterLoop);
1168 }
1169 
1171  bool Transformed = false;
1172  Instruction *InnerIndexVar;
1173 
1174  if (InnerLoop->getSubLoops().empty()) {
1175  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1176  DEBUG(dbgs() << "Calling Split Inner Loop\n");
1177  PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
1178  if (!InductionPHI) {
1179  DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1180  return false;
1181  }
1182 
1183  if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1184  InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
1185  else
1186  InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
1187 
1188  // Ensure that InductionPHI is the first Phi node as required by
1189  // splitInnerLoopHeader
1190  if (&InductionPHI->getParent()->front() != InductionPHI)
1191  InductionPHI->moveBefore(&InductionPHI->getParent()->front());
1192 
1193  // Split at the place were the induction variable is
1194  // incremented/decremented.
1195  // TODO: This splitting logic may not work always. Fix this.
1196  splitInnerLoopLatch(InnerIndexVar);
1197  DEBUG(dbgs() << "splitInnerLoopLatch done\n");
1198 
1199  // Splits the inner loops phi nodes out into a separate basic block.
1200  splitInnerLoopHeader();
1201  DEBUG(dbgs() << "splitInnerLoopHeader done\n");
1202  }
1203 
1204  Transformed |= adjustLoopLinks();
1205  if (!Transformed) {
1206  DEBUG(dbgs() << "adjustLoopLinks failed\n");
1207  return false;
1208  }
1209 
1210  restructureLoops(InnerLoop, OuterLoop);
1211  return true;
1212 }
1213 
1214 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
1215  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1216  BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
1217  InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
1218 }
1219 
1220 void LoopInterchangeTransform::splitInnerLoopHeader() {
1221  // Split the inner loop header out. Here make sure that the reduction PHI's
1222  // stay in the innerloop body.
1223  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1224  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1225  if (InnerLoopHasReduction) {
1226  // Note: The induction PHI must be the first PHI for this to work
1227  BasicBlock *New = InnerLoopHeader->splitBasicBlock(
1228  ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
1229  if (LI)
1230  if (Loop *L = LI->getLoopFor(InnerLoopHeader))
1231  L->addBasicBlockToLoop(New, *LI);
1232 
1233  // Adjust Reduction PHI's in the block.
1235  for (auto I = New->begin(); isa<PHINode>(I); ++I) {
1236  PHINode *PHI = dyn_cast<PHINode>(I);
1237  Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
1238  PHI->replaceAllUsesWith(V);
1239  PHIVec.push_back((PHI));
1240  }
1241  for (PHINode *P : PHIVec) {
1242  P->eraseFromParent();
1243  }
1244  } else {
1245  SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1246  }
1247 
1248  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
1249  "InnerLoopHeader\n");
1250 }
1251 
1252 /// \brief Move all instructions except the terminator from FromBB right before
1253 /// InsertBefore
1254 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1255  auto &ToList = InsertBefore->getParent()->getInstList();
1256  auto &FromList = FromBB->getInstList();
1257 
1258  ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1259  FromBB->getTerminator()->getIterator());
1260 }
1261 
1262 void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
1263  BasicBlock *OldPred,
1264  BasicBlock *NewPred) {
1265  for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
1266  PHINode *PHI = cast<PHINode>(I);
1267  unsigned Num = PHI->getNumIncomingValues();
1268  for (unsigned i = 0; i < Num; ++i) {
1269  if (PHI->getIncomingBlock(i) == OldPred)
1270  PHI->setIncomingBlock(i, NewPred);
1271  }
1272  }
1273 }
1274 
1275 bool LoopInterchangeTransform::adjustLoopBranches() {
1276  DEBUG(dbgs() << "adjustLoopBranches called\n");
1277  // Adjust the loop preheader
1278  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1279  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1280  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1281  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1282  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1283  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1284  BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1285  BasicBlock *InnerLoopLatchPredecessor =
1286  InnerLoopLatch->getUniquePredecessor();
1287  BasicBlock *InnerLoopLatchSuccessor;
1288  BasicBlock *OuterLoopLatchSuccessor;
1289 
1290  BranchInst *OuterLoopLatchBI =
1291  dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1292  BranchInst *InnerLoopLatchBI =
1293  dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1294  BranchInst *OuterLoopHeaderBI =
1295  dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1296  BranchInst *InnerLoopHeaderBI =
1297  dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1298 
1299  if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1300  !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1301  !InnerLoopHeaderBI)
1302  return false;
1303 
1304  BranchInst *InnerLoopLatchPredecessorBI =
1305  dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1306  BranchInst *OuterLoopPredecessorBI =
1307  dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1308 
1309  if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1310  return false;
1311  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1312  if (!InnerLoopHeaderSuccessor)
1313  return false;
1314 
1315  // Adjust Loop Preheader and headers
1316 
1317  unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
1318  for (unsigned i = 0; i < NumSucc; ++i) {
1319  if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
1320  OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
1321  }
1322 
1323  NumSucc = OuterLoopHeaderBI->getNumSuccessors();
1324  for (unsigned i = 0; i < NumSucc; ++i) {
1325  if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
1326  OuterLoopHeaderBI->setSuccessor(i, LoopExit);
1327  else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
1328  OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
1329  }
1330 
1331  // Adjust reduction PHI's now that the incoming block has changed.
1332  updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
1333  OuterLoopHeader);
1334 
1335  BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
1336  InnerLoopHeaderBI->eraseFromParent();
1337 
1338  // -------------Adjust loop latches-----------
1339  if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1340  InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1341  else
1342  InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1343 
1344  NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
1345  for (unsigned i = 0; i < NumSucc; ++i) {
1346  if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
1347  InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
1348  }
1349 
1350  // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
1351  // the value and remove this PHI node from inner loop.
1352  SmallVector<PHINode *, 8> LcssaVec;
1353  for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
1354  PHINode *LcssaPhi = cast<PHINode>(I);
1355  LcssaVec.push_back(LcssaPhi);
1356  }
1357  for (PHINode *P : LcssaVec) {
1358  Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
1359  P->replaceAllUsesWith(Incoming);
1360  P->eraseFromParent();
1361  }
1362 
1363  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1364  OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1365  else
1366  OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1367 
1368  if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
1369  InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
1370  else
1371  InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
1372 
1373  updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
1374 
1375  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
1376  OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
1377  } else {
1378  OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
1379  }
1380 
1381  return true;
1382 }
1383 
1384 void LoopInterchangeTransform::adjustLoopPreheaders() {
1385  // We have interchanged the preheaders so we need to interchange the data in
1386  // the preheader as well.
1387  // This is because the content of inner preheader was previously executed
1388  // inside the outer loop.
1389  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1390  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1391  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1392  BranchInst *InnerTermBI =
1393  cast<BranchInst>(InnerLoopPreHeader->getTerminator());
1394 
1395  // These instructions should now be executed inside the loop.
1396  // Move instruction into a new block after outer header.
1397  moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
1398  // These instructions were not executed previously in the loop so move them to
1399  // the older inner loop preheader.
1400  moveBBContents(OuterLoopPreHeader, InnerTermBI);
1401 }
1402 
1403 bool LoopInterchangeTransform::adjustLoopLinks() {
1404  // Adjust all branches in the inner and outer loop.
1405  bool Changed = adjustLoopBranches();
1406  if (Changed)
1407  adjustLoopPreheaders();
1408  return Changed;
1409 }
1410 
1411 char LoopInterchange::ID = 0;
1412 
1413 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
1414  "Interchanges loops for cache reuse", false, false)
1419 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1420 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
1423 
1424 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
1425  "Interchanges loops for cache reuse", false, false)
1426 
1427 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
void push_back(const T &Elt)
Definition: SmallVector.h:212
Diagnostic information for missed-optimization remarks.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:157
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
DiagnosticInfoOptimizationBase::Argument NV
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
Split the specified block at the specified instruction - everything before SplitPt stays in Old and e...
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:136
Legacy pass manager pass to access dependence information.
static bool isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix)
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
Definition: LoopInfo.h:320
The main scalar evolution driver.
This class represents a function call, abstracting a target machine&#39;s calling convention.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:106
static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, unsigned Column)
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes)
Returns true if Phi is a reduction in TheLoop.
Definition: LoopUtils.cpp:482
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn&#39;t have a preheader, this method is called...
BasicBlock * getSuccessor(unsigned i) const
F(f)
An instruction for reading from memory.
Definition: Instructions.h:164
Hexagon Common GEP
DependenceInfo - This class is the main dependence-analysis driver.
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:252
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI)
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
unsigned getNumSuccessors() const
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:781
BlockT * getHeader() const
Definition: LoopInfo.h:100
ConstantInt * getValue() const
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:186
std::vector< Loop *>::const_iterator iterator
Definition: LoopInfo.h:139
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
static BasicBlock * getLoopLatchExitBlock(BasicBlock *LatchBlock, BasicBlock *LoopHeader)
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:183
This node represents a polynomial recurrence on the trip count of the specified loop.
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:230
static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, unsigned OuterLoopId, char InnerDep, char OuterDep)
static const unsigned MaxLoopNestDepth
An instruction for storing to memory.
Definition: Instructions.h:306
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:430
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
loop interchange
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:140
Value * getOperand(unsigned i) const
Definition: User.h:154
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
Definition: Instructions.h:837
const SCEV * getCouldNotCompute()
succ_range successors()
Definition: InstrTypes.h:267
static bool runOnFunction(Function &F, bool PostInlining)
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:406
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:171
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
Conditional or Unconditional Branch instruction.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
const Instruction & front() const
Definition: BasicBlock.h:264
char & LCSSAID
Definition: LCSSA.cpp:412
loop Interchanges loops for cache reuse
Diagnostic information for applied optimization remarks.
Represent the analysis usage information of a pass.
void splice(iterator where, iplist_impl &L2)
Definition: ilist.h:342
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
self_iterator getIterator()
Definition: ilist_node.h:82
Used in the streaming interface as the general argument type.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:345
static unsigned getIncomingValueNumForOperand(unsigned i)
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
void initializeLoopInterchangePass(PassRegistry &)
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:56
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
bool isNegative() const
Definition: Constants.h:188
char & LoopSimplifyID
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: LoopUtils.h:73
#define DEBUG_TYPE
BlockT * getLoopPredecessor() const
If the given loop&#39;s header has exactly one unique predecessor outside the loop, return it...
Definition: LoopInfoImpl.h:134
const InstListType & getInstList() const
Return the underlying instruction list container.
Definition: BasicBlock.h:317
Iterator for intrusive lists based on ilist_node.
unsigned getNumOperands() const
Definition: User.h:176
static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock)
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
A struct for saving information about induction variables.
Definition: LoopUtils.h:271
void setIncomingBlock(unsigned i, BasicBlock *BB)
Pass * createLoopInterchangePass()
iterator end()
Definition: BasicBlock.h:254
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:298
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore)
Move all instructions except the terminator from FromBB right before InsertBefore.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:864
iterator begin() const
Definition: LoopInfo.h:142
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:385
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static const unsigned MaxMemInstrCount
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:923
PHINode * getCanonicalInductionVariable() const
Check to see if the loop has a canonical induction variable: an integer recurrence that starts at 0 a...
Definition: LoopInfo.cpp:111
iterator_range< user_iterator > users()
Definition: Value.h:401
INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) INITIALIZE_PASS_END(LoopInterchange
LoopT * getParentLoop() const
Definition: LoopInfo.h:101
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:131
This class represents an analyzed expression in the program.
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
Definition: LoopInfo.h:311
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:61
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:439
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:220
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
#define I(x, y, z)
Definition: MD5.cpp:58
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
iterator end() const
Definition: LoopInfo.h:143
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:382
static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, unsigned Column)
OutputIt transform(R &&Range, OutputIt d_first, UnaryPredicate P)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere...
Definition: STLExtras.h:845
static void populateWorklist(Loop &L, SmallVector< LoopVector, 8 > &V)
bool empty() const
Definition: LoopInfo.h:146
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
LLVM Value Representation.
Definition: Value.h:73
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
OptimizationRemarkEmitter legacy analysis pass.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:88
#define DEBUG(X)
Definition: Debug.h:118
The legacy pass manager&#39;s analysis pass to compute loop information.
Definition: LoopInfo.h:958
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:267
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
const TerminatorInst * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:120
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:156
The optimization diagnostic interface.
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr)
Returns true if Phi is an induction in the loop L.
Definition: LoopUtils.cpp:876
loops
Definition: LoopInfo.cpp:750
const BasicBlock * getParent() const
Definition: Instruction.h:66
This class represents a constant integer value.
static PHINode * getInductionVariable(Loop *L, ScalarEvolution *SE)
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:252