LLVM 20.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/ADT/StringSet.h"
29#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/Dominators.h"
32#include "llvm/IR/Function.h"
33#include "llvm/IR/InstrTypes.h"
34#include "llvm/IR/Instruction.h"
36#include "llvm/IR/User.h"
37#include "llvm/IR/Value.h"
40#include "llvm/Support/Debug.h"
46#include <cassert>
47#include <utility>
48#include <vector>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "loop-interchange"
53
54STATISTIC(LoopsInterchanged, "Number of loops interchanged");
55
57 "loop-interchange-threshold", cl::init(0), cl::Hidden,
58 cl::desc("Interchange if you gain more than this number"));
59
60// Maximum number of load-stores that can be handled in the dependency matrix.
62 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
64 "Maximum number of load-store instructions that should be handled "
65 "in the dependency matrix. Higher value may lead to more interchanges "
66 "at the cost of compile-time"));
67
68namespace {
69
70using LoopVector = SmallVector<Loop *, 8>;
71
72// TODO: Check if we can use a sparse matrix here.
73using CharMatrix = std::vector<std::vector<char>>;
74
75} // end anonymous namespace
76
77// Minimum loop depth supported.
79 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
80 cl::desc("Minimum depth of loop nest considered for the transform"));
81
82// Maximum loop depth supported.
84 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
85 cl::desc("Maximum depth of loop nest considered for the transform"));
86
87#ifndef NDEBUG
88static void printDepMatrix(CharMatrix &DepMatrix) {
89 for (auto &Row : DepMatrix) {
90 for (auto D : Row)
91 LLVM_DEBUG(dbgs() << D << " ");
92 LLVM_DEBUG(dbgs() << "\n");
93 }
94}
95#endif
96
97static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
98 Loop *L, DependenceInfo *DI,
101 using ValueVector = SmallVector<Value *, 16>;
102
103 ValueVector MemInstr;
104
105 // For each block.
106 for (BasicBlock *BB : L->blocks()) {
107 // Scan the BB and collect legal loads and stores.
108 for (Instruction &I : *BB) {
109 if (!isa<Instruction>(I))
110 return false;
111 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
112 if (!Ld->isSimple())
113 return false;
114 MemInstr.push_back(&I);
115 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
116 if (!St->isSimple())
117 return false;
118 MemInstr.push_back(&I);
119 }
120 }
121 }
122
123 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
124 << " Loads and Stores to analyze\n");
125 if (MemInstr.size() > MaxMemInstrCount) {
126 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
127 << MaxMemInstrCount << " load/stores in a loop\n");
128 ORE->emit([&]() {
129 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
130 L->getStartLoc(), L->getHeader())
131 << "Number of loads/stores exceeded, the supported maximum "
132 "can be increased with option "
133 "-loop-interchange-maxmeminstr-count.";
134 });
135 return false;
136 }
137 ValueVector::iterator I, IE, J, JE;
138 StringSet<> Seen;
139
140 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
141 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
142 std::vector<char> Dep;
143 Instruction *Src = cast<Instruction>(*I);
144 Instruction *Dst = cast<Instruction>(*J);
145 // Ignore Input dependencies.
146 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
147 continue;
148 // Track Output, Flow, and Anti dependencies.
149 if (auto D = DI->depends(Src, Dst, true)) {
150 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
151 // If the direction vector is negative, normalize it to
152 // make it non-negative.
153 if (D->normalize(SE))
154 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
155 LLVM_DEBUG(StringRef DepType =
156 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
157 dbgs() << "Found " << DepType
158 << " dependency between Src and Dst\n"
159 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
160 unsigned Levels = D->getLevels();
161 char Direction;
162 for (unsigned II = 1; II <= Levels; ++II) {
163 unsigned Dir = D->getDirection(II);
165 Direction = '<';
166 else if (Dir == Dependence::DVEntry::GT ||
168 Direction = '>';
169 else if (Dir == Dependence::DVEntry::EQ)
170 Direction = '=';
171 else
172 Direction = '*';
173 Dep.push_back(Direction);
174 }
175 while (Dep.size() != Level) {
176 Dep.push_back('I');
177 }
178
179 // Make sure we only add unique entries to the dependency matrix.
180 if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
181 DepMatrix.push_back(Dep);
182 }
183 }
184 }
185
186 return true;
187}
188
189// A loop is moved from index 'from' to an index 'to'. Update the Dependence
190// matrix by exchanging the two columns.
191static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
192 unsigned ToIndx) {
193 for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
194 std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
195}
196
197// After interchanging, check if the direction vector is valid.
198// [Theorem] A permutation of the loops in a perfect nest is legal if and only
199// if the direction matrix, after the same permutation is applied to its
200// columns, has no ">" direction as the leftmost non-"=" direction in any row.
201static bool isLexicographicallyPositive(std::vector<char> &DV) {
202 for (unsigned char Direction : DV) {
203 if (Direction == '<')
204 return true;
205 if (Direction == '>' || Direction == '*')
206 return false;
207 }
208 return true;
209}
210
211// Checks if it is legal to interchange 2 loops.
212static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
213 unsigned InnerLoopId,
214 unsigned OuterLoopId) {
215 unsigned NumRows = DepMatrix.size();
216 std::vector<char> Cur;
217 // For each row check if it is valid to interchange.
218 for (unsigned Row = 0; Row < NumRows; ++Row) {
219 // Create temporary DepVector check its lexicographical order
220 // before and after swapping OuterLoop vs InnerLoop
221 Cur = DepMatrix[Row];
223 return false;
224 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
226 return false;
227 }
228 return true;
229}
230
231static void populateWorklist(Loop &L, LoopVector &LoopList) {
232 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
233 << L.getHeader()->getParent()->getName() << " Loop: %"
234 << L.getHeader()->getName() << '\n');
235 assert(LoopList.empty() && "LoopList should initially be empty!");
236 Loop *CurrentLoop = &L;
237 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
238 while (!Vec->empty()) {
239 // The current loop has multiple subloops in it hence it is not tightly
240 // nested.
241 // Discard all loops above it added into Worklist.
242 if (Vec->size() != 1) {
243 LoopList = {};
244 return;
245 }
246
247 LoopList.push_back(CurrentLoop);
248 CurrentLoop = Vec->front();
249 Vec = &CurrentLoop->getSubLoops();
250 }
251 LoopList.push_back(CurrentLoop);
252}
253
256 unsigned LoopNestDepth = LoopList.size();
257 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
258 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
259 << ", the supported range is [" << MinLoopNestDepth
260 << ", " << MaxLoopNestDepth << "].\n");
261 Loop **OuterLoop = LoopList.begin();
262 ORE.emit([&]() {
263 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
264 (*OuterLoop)->getStartLoc(),
265 (*OuterLoop)->getHeader())
266 << "Unsupported depth of loop nest, the supported range is ["
267 << std::to_string(MinLoopNestDepth) << ", "
268 << std::to_string(MaxLoopNestDepth) << "].\n";
269 });
270 return false;
271 }
272 return true;
273}
274namespace {
275
276/// LoopInterchangeLegality checks if it is legal to interchange the loop.
277class LoopInterchangeLegality {
278public:
279 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
281 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
282
283 /// Check if the loops can be interchanged.
284 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
285 CharMatrix &DepMatrix);
286
287 /// Discover induction PHIs in the header of \p L. Induction
288 /// PHIs are added to \p Inductions.
289 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
290
291 /// Check if the loop structure is understood. We do not handle triangular
292 /// loops for now.
293 bool isLoopStructureUnderstood();
294
295 bool currentLimitations();
296
297 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
298 return OuterInnerReductions;
299 }
300
301 const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
302 return InnerLoopInductions;
303 }
304
305private:
306 bool tightlyNested(Loop *Outer, Loop *Inner);
307 bool containsUnsafeInstructions(BasicBlock *BB);
308
309 /// Discover induction and reduction PHIs in the header of \p L. Induction
310 /// PHIs are added to \p Inductions, reductions are added to
311 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
312 /// to be passed as \p InnerLoop.
313 bool findInductionAndReductions(Loop *L,
314 SmallVector<PHINode *, 8> &Inductions,
315 Loop *InnerLoop);
316
317 Loop *OuterLoop;
318 Loop *InnerLoop;
319
320 ScalarEvolution *SE;
321
322 /// Interface to emit optimization remarks.
324
325 /// Set of reduction PHIs taking part of a reduction across the inner and
326 /// outer loop.
327 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
328
329 /// Set of inner loop induction PHIs
330 SmallVector<PHINode *, 8> InnerLoopInductions;
331};
332
333/// LoopInterchangeProfitability checks if it is profitable to interchange the
334/// loop.
335class LoopInterchangeProfitability {
336public:
337 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
339 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
340
341 /// Check if the loop interchange is profitable.
342 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
343 unsigned InnerLoopId, unsigned OuterLoopId,
344 CharMatrix &DepMatrix,
346 std::unique_ptr<CacheCost> &CC);
347
348private:
349 int getInstrOrderCost();
350 std::optional<bool> isProfitablePerLoopCacheAnalysis(
352 std::unique_ptr<CacheCost> &CC);
353 std::optional<bool> isProfitablePerInstrOrderCost();
354 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
355 unsigned OuterLoopId,
356 CharMatrix &DepMatrix);
357 Loop *OuterLoop;
358 Loop *InnerLoop;
359
360 /// Scev analysis.
361 ScalarEvolution *SE;
362
363 /// Interface to emit optimization remarks.
365};
366
367/// LoopInterchangeTransform interchanges the loop.
368class LoopInterchangeTransform {
369public:
370 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
371 LoopInfo *LI, DominatorTree *DT,
372 const LoopInterchangeLegality &LIL)
373 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
374
375 /// Interchange OuterLoop and InnerLoop.
376 bool transform();
377 void restructureLoops(Loop *NewInner, Loop *NewOuter,
378 BasicBlock *OrigInnerPreHeader,
379 BasicBlock *OrigOuterPreHeader);
380 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
381
382private:
383 bool adjustLoopLinks();
384 bool adjustLoopBranches();
385
386 Loop *OuterLoop;
387 Loop *InnerLoop;
388
389 /// Scev analysis.
390 ScalarEvolution *SE;
391
392 LoopInfo *LI;
393 DominatorTree *DT;
394
395 const LoopInterchangeLegality &LIL;
396};
397
398struct LoopInterchange {
399 ScalarEvolution *SE = nullptr;
400 LoopInfo *LI = nullptr;
401 DependenceInfo *DI = nullptr;
402 DominatorTree *DT = nullptr;
403 std::unique_ptr<CacheCost> CC = nullptr;
404
405 /// Interface to emit optimization remarks.
407
408 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
409 DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
411 : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
412
413 bool run(Loop *L) {
414 if (L->getParentLoop())
415 return false;
416 SmallVector<Loop *, 8> LoopList;
417 populateWorklist(*L, LoopList);
418 return processLoopList(LoopList);
419 }
420
421 bool run(LoopNest &LN) {
422 SmallVector<Loop *, 8> LoopList(LN.getLoops());
423 for (unsigned I = 1; I < LoopList.size(); ++I)
424 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
425 return false;
426 return processLoopList(LoopList);
427 }
428
429 bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
430 for (Loop *L : LoopList) {
431 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
432 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
433 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
434 return false;
435 }
436 if (L->getNumBackEdges() != 1) {
437 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
438 return false;
439 }
440 if (!L->getExitingBlock()) {
441 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
442 return false;
443 }
444 }
445 return true;
446 }
447
448 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
449 // TODO: Add a better heuristic to select the loop to be interchanged based
450 // on the dependence matrix. Currently we select the innermost loop.
451 return LoopList.size() - 1;
452 }
453
454 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
455 bool Changed = false;
456
457 // Ensure proper loop nest depth.
458 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
459 "Unsupported depth of loop nest.");
460
461 unsigned LoopNestDepth = LoopList.size();
462 if (!isComputableLoopNest(LoopList)) {
463 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
464 return false;
465 }
466
467 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
468 << "\n");
469
470 CharMatrix DependencyMatrix;
471 Loop *OuterMostLoop = *(LoopList.begin());
472 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
473 OuterMostLoop, DI, SE, ORE)) {
474 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
475 return false;
476 }
477
478 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
479 printDepMatrix(DependencyMatrix));
480
481 // Get the Outermost loop exit.
482 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
483 if (!LoopNestExit) {
484 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
485 return false;
486 }
487
488 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
489 // Obtain the loop vector returned from loop cache analysis beforehand,
490 // and put each <Loop, index> pair into a map for constant time query
491 // later. Indices in loop vector reprsent the optimal order of the
492 // corresponding loop, e.g., given a loopnest with depth N, index 0
493 // indicates the loop should be placed as the outermost loop and index N
494 // indicates the loop should be placed as the innermost loop.
495 //
496 // For the old pass manager CacheCost would be null.
498 if (CC != nullptr) {
499 const auto &LoopCosts = CC->getLoopCosts();
500 for (unsigned i = 0; i < LoopCosts.size(); i++) {
501 CostMap[LoopCosts[i].first] = i;
502 }
503 }
504 // We try to achieve the globally optimal memory access for the loopnest,
505 // and do interchange based on a bubble-sort fasion. We start from
506 // the innermost loop, move it outwards to the best possible position
507 // and repeat this process.
508 for (unsigned j = SelecLoopId; j > 0; j--) {
509 bool ChangedPerIter = false;
510 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
511 bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
512 DependencyMatrix, CostMap);
513 if (!Interchanged)
514 continue;
515 // Loops interchanged, update LoopList accordingly.
516 std::swap(LoopList[i - 1], LoopList[i]);
517 // Update the DependencyMatrix
518 interChangeDependencies(DependencyMatrix, i, i - 1);
519
520 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
521 printDepMatrix(DependencyMatrix));
522
523 ChangedPerIter |= Interchanged;
524 Changed |= Interchanged;
525 }
526 // Early abort if there was no interchange during an entire round of
527 // moving loops outwards.
528 if (!ChangedPerIter)
529 break;
530 }
531 return Changed;
532 }
533
534 bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
535 unsigned OuterLoopId,
536 std::vector<std::vector<char>> &DependencyMatrix,
537 const DenseMap<const Loop *, unsigned> &CostMap) {
538 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
539 << " and OuterLoopId = " << OuterLoopId << "\n");
540 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
541 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
542 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
543 return false;
544 }
545 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
546 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
547 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
548 DependencyMatrix, CostMap, CC)) {
549 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
550 return false;
551 }
552
553 ORE->emit([&]() {
554 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
555 InnerLoop->getStartLoc(),
556 InnerLoop->getHeader())
557 << "Loop interchanged with enclosing loop.";
558 });
559
560 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
561 LIT.transform();
562 LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
563 LoopsInterchanged++;
564
565 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
566 return true;
567 }
568};
569
570} // end anonymous namespace
571
572bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
573 return any_of(*BB, [](const Instruction &I) {
574 return I.mayHaveSideEffects() || I.mayReadFromMemory();
575 });
576}
577
578bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
579 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
580 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
581 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
582
583 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
584
585 // A perfectly nested loop will not have any branch in between the outer and
586 // inner block i.e. outer header will branch to either inner preheader and
587 // outerloop latch.
588 BranchInst *OuterLoopHeaderBI =
589 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
590 if (!OuterLoopHeaderBI)
591 return false;
592
593 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
594 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
595 Succ != OuterLoopLatch)
596 return false;
597
598 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
599 // We do not have any basic block in between now make sure the outer header
600 // and outer loop latch doesn't contain any unsafe instructions.
601 if (containsUnsafeInstructions(OuterLoopHeader) ||
602 containsUnsafeInstructions(OuterLoopLatch))
603 return false;
604
605 // Also make sure the inner loop preheader does not contain any unsafe
606 // instructions. Note that all instructions in the preheader will be moved to
607 // the outer loop header when interchanging.
608 if (InnerLoopPreHeader != OuterLoopHeader &&
609 containsUnsafeInstructions(InnerLoopPreHeader))
610 return false;
611
612 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
613 // Ensure the inner loop exit block flows to the outer loop latch possibly
614 // through empty blocks.
615 const BasicBlock &SuccInner =
616 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
617 if (&SuccInner != OuterLoopLatch) {
618 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
619 << " does not lead to the outer loop latch.\n";);
620 return false;
621 }
622 // The inner loop exit block does flow to the outer loop latch and not some
623 // other BBs, now make sure it contains safe instructions, since it will be
624 // moved into the (new) inner loop after interchange.
625 if (containsUnsafeInstructions(InnerLoopExit))
626 return false;
627
628 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
629 // We have a perfect loop nest.
630 return true;
631}
632
633bool LoopInterchangeLegality::isLoopStructureUnderstood() {
634 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
635 for (PHINode *InnerInduction : InnerLoopInductions) {
636 unsigned Num = InnerInduction->getNumOperands();
637 for (unsigned i = 0; i < Num; ++i) {
638 Value *Val = InnerInduction->getOperand(i);
639 if (isa<Constant>(Val))
640 continue;
641 Instruction *I = dyn_cast<Instruction>(Val);
642 if (!I)
643 return false;
644 // TODO: Handle triangular loops.
645 // e.g. for(int i=0;i<N;i++)
646 // for(int j=i;j<N;j++)
647 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
648 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
649 InnerLoopPreheader &&
650 !OuterLoop->isLoopInvariant(I)) {
651 return false;
652 }
653 }
654 }
655
656 // TODO: Handle triangular loops of another form.
657 // e.g. for(int i=0;i<N;i++)
658 // for(int j=0;j<i;j++)
659 // or,
660 // for(int i=0;i<N;i++)
661 // for(int j=0;j*i<N;j++)
662 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
663 BranchInst *InnerLoopLatchBI =
664 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
665 if (!InnerLoopLatchBI->isConditional())
666 return false;
667 if (CmpInst *InnerLoopCmp =
668 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
669 Value *Op0 = InnerLoopCmp->getOperand(0);
670 Value *Op1 = InnerLoopCmp->getOperand(1);
671
672 // LHS and RHS of the inner loop exit condition, e.g.,
673 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
674 Value *Left = nullptr;
675 Value *Right = nullptr;
676
677 // Check if V only involves inner loop induction variable.
678 // Return true if V is InnerInduction, or a cast from
679 // InnerInduction, or a binary operator that involves
680 // InnerInduction and a constant.
681 std::function<bool(Value *)> IsPathToInnerIndVar;
682 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
683 if (llvm::is_contained(InnerLoopInductions, V))
684 return true;
685 if (isa<Constant>(V))
686 return true;
687 const Instruction *I = dyn_cast<Instruction>(V);
688 if (!I)
689 return false;
690 if (isa<CastInst>(I))
691 return IsPathToInnerIndVar(I->getOperand(0));
692 if (isa<BinaryOperator>(I))
693 return IsPathToInnerIndVar(I->getOperand(0)) &&
694 IsPathToInnerIndVar(I->getOperand(1));
695 return false;
696 };
697
698 // In case of multiple inner loop indvars, it is okay if LHS and RHS
699 // are both inner indvar related variables.
700 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
701 return true;
702
703 // Otherwise we check if the cmp instruction compares an inner indvar
704 // related variable (Left) with a outer loop invariant (Right).
705 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
706 Left = Op0;
707 Right = Op1;
708 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
709 Left = Op1;
710 Right = Op0;
711 }
712
713 if (Left == nullptr)
714 return false;
715
716 const SCEV *S = SE->getSCEV(Right);
717 if (!SE->isLoopInvariant(S, OuterLoop))
718 return false;
719 }
720
721 return true;
722}
723
724// If SV is a LCSSA PHI node with a single incoming value, return the incoming
725// value.
726static Value *followLCSSA(Value *SV) {
727 PHINode *PHI = dyn_cast<PHINode>(SV);
728 if (!PHI)
729 return SV;
730
731 if (PHI->getNumIncomingValues() != 1)
732 return SV;
733 return followLCSSA(PHI->getIncomingValue(0));
734}
735
736// Check V's users to see if it is involved in a reduction in L.
738 // Reduction variables cannot be constants.
739 if (isa<Constant>(V))
740 return nullptr;
741
742 for (Value *User : V->users()) {
743 if (PHINode *PHI = dyn_cast<PHINode>(User)) {
744 if (PHI->getNumIncomingValues() == 1)
745 continue;
748 // Detect floating point reduction only when it can be reordered.
749 if (RD.getExactFPMathInst() != nullptr)
750 return nullptr;
751 return PHI;
752 }
753 return nullptr;
754 }
755 }
756
757 return nullptr;
758}
759
760bool LoopInterchangeLegality::findInductionAndReductions(
761 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
762 if (!L->getLoopLatch() || !L->getLoopPredecessor())
763 return false;
764 for (PHINode &PHI : L->getHeader()->phis()) {
767 Inductions.push_back(&PHI);
768 else {
769 // PHIs in inner loops need to be part of a reduction in the outer loop,
770 // discovered when checking the PHIs of the outer loop earlier.
771 if (!InnerLoop) {
772 if (!OuterInnerReductions.count(&PHI)) {
773 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
774 "across the outer loop.\n");
775 return false;
776 }
777 } else {
778 assert(PHI.getNumIncomingValues() == 2 &&
779 "Phis in loop header should have exactly 2 incoming values");
780 // Check if we have a PHI node in the outer loop that has a reduction
781 // result from the inner loop as an incoming value.
782 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
783 PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
784 if (!InnerRedPhi ||
785 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
787 dbgs()
788 << "Failed to recognize PHI as an induction or reduction.\n");
789 return false;
790 }
791 OuterInnerReductions.insert(&PHI);
792 OuterInnerReductions.insert(InnerRedPhi);
793 }
794 }
795 }
796 return true;
797}
798
799// This function indicates the current limitations in the transform as a result
800// of which we do not proceed.
801bool LoopInterchangeLegality::currentLimitations() {
802 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
803
804 // transform currently expects the loop latches to also be the exiting
805 // blocks.
806 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
807 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
808 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
809 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
811 dbgs() << "Loops where the latch is not the exiting block are not"
812 << " supported currently.\n");
813 ORE->emit([&]() {
814 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
815 OuterLoop->getStartLoc(),
816 OuterLoop->getHeader())
817 << "Loops where the latch is not the exiting block cannot be"
818 " interchange currently.";
819 });
820 return true;
821 }
822
823 SmallVector<PHINode *, 8> Inductions;
824 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
826 dbgs() << "Only outer loops with induction or reduction PHI nodes "
827 << "are supported currently.\n");
828 ORE->emit([&]() {
829 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
830 OuterLoop->getStartLoc(),
831 OuterLoop->getHeader())
832 << "Only outer loops with induction or reduction PHI nodes can be"
833 " interchanged currently.";
834 });
835 return true;
836 }
837
838 Inductions.clear();
839 // For multi-level loop nests, make sure that all phi nodes for inner loops
840 // at all levels can be recognized as a induction or reduction phi. Bail out
841 // if a phi node at a certain nesting level cannot be properly recognized.
842 Loop *CurLevelLoop = OuterLoop;
843 while (!CurLevelLoop->getSubLoops().empty()) {
844 // We already made sure that the loop nest is tightly nested.
845 CurLevelLoop = CurLevelLoop->getSubLoops().front();
846 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
848 dbgs() << "Only inner loops with induction or reduction PHI nodes "
849 << "are supported currently.\n");
850 ORE->emit([&]() {
851 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
852 CurLevelLoop->getStartLoc(),
853 CurLevelLoop->getHeader())
854 << "Only inner loops with induction or reduction PHI nodes can be"
855 " interchange currently.";
856 });
857 return true;
858 }
859 }
860
861 // TODO: Triangular loops are not handled for now.
862 if (!isLoopStructureUnderstood()) {
863 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
864 ORE->emit([&]() {
865 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
866 InnerLoop->getStartLoc(),
867 InnerLoop->getHeader())
868 << "Inner loop structure not understood currently.";
869 });
870 return true;
871 }
872
873 return false;
874}
875
876bool LoopInterchangeLegality::findInductions(
877 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
878 for (PHINode &PHI : L->getHeader()->phis()) {
881 Inductions.push_back(&PHI);
882 }
883 return !Inductions.empty();
884}
885
886// We currently only support LCSSA PHI nodes in the inner loop exit, if their
887// users are either reduction PHIs or PHIs outside the outer loop (which means
888// the we are only interested in the final value after the loop).
889static bool
892 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
893 for (PHINode &PHI : InnerExit->phis()) {
894 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
895 if (PHI.getNumIncomingValues() > 1)
896 return false;
897 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
898 PHINode *PN = dyn_cast<PHINode>(U);
899 return !PN ||
900 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
901 })) {
902 return false;
903 }
904 }
905 return true;
906}
907
908// We currently support LCSSA PHI nodes in the outer loop exit, if their
909// incoming values do not come from the outer loop latch or if the
910// outer loop latch has a single predecessor. In that case, the value will
911// be available if both the inner and outer loop conditions are true, which
912// will still be true after interchanging. If we have multiple predecessor,
913// that may not be the case, e.g. because the outer loop latch may be executed
914// if the inner loop is not executed.
915static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
916 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
917 for (PHINode &PHI : LoopNestExit->phis()) {
918 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
919 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
920 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
921 continue;
922
923 // The incoming value is defined in the outer loop latch. Currently we
924 // only support that in case the outer loop latch has a single predecessor.
925 // This guarantees that the outer loop latch is executed if and only if
926 // the inner loop is executed (because tightlyNested() guarantees that the
927 // outer loop header only branches to the inner loop or the outer loop
928 // latch).
929 // FIXME: We could weaken this logic and allow multiple predecessors,
930 // if the values are produced outside the loop latch. We would need
931 // additional logic to update the PHI nodes in the exit block as
932 // well.
933 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
934 return false;
935 }
936 }
937 return true;
938}
939
940// In case of multi-level nested loops, it may occur that lcssa phis exist in
941// the latch of InnerLoop, i.e., when defs of the incoming values are further
942// inside the loopnest. Sometimes those incoming values are not available
943// after interchange, since the original inner latch will become the new outer
944// latch which may have predecessor paths that do not include those incoming
945// values.
946// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
947// multi-level loop nests.
948static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
949 if (InnerLoop->getSubLoops().empty())
950 return true;
951 // If the original outer latch has only one predecessor, then values defined
952 // further inside the looploop, e.g., in the innermost loop, will be available
953 // at the new outer latch after interchange.
954 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
955 return true;
956
957 // The outer latch has more than one predecessors, i.e., the inner
958 // exit and the inner header.
959 // PHI nodes in the inner latch are lcssa phis where the incoming values
960 // are defined further inside the loopnest. Check if those phis are used
961 // in the original inner latch. If that is the case then bail out since
962 // those incoming values may not be available at the new outer latch.
963 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
964 for (PHINode &PHI : InnerLoopLatch->phis()) {
965 for (auto *U : PHI.users()) {
966 Instruction *UI = cast<Instruction>(U);
967 if (InnerLoopLatch == UI->getParent())
968 return false;
969 }
970 }
971 return true;
972}
973
974bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
975 unsigned OuterLoopId,
976 CharMatrix &DepMatrix) {
977 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
978 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
979 << " and OuterLoopId = " << OuterLoopId
980 << " due to dependence\n");
981 ORE->emit([&]() {
982 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
983 InnerLoop->getStartLoc(),
984 InnerLoop->getHeader())
985 << "Cannot interchange loops due to dependences.";
986 });
987 return false;
988 }
989 // Check if outer and inner loop contain legal instructions only.
990 for (auto *BB : OuterLoop->blocks())
992 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
993 // readnone functions do not prevent interchanging.
994 if (CI->onlyWritesMemory())
995 continue;
997 dbgs() << "Loops with call instructions cannot be interchanged "
998 << "safely.");
999 ORE->emit([&]() {
1000 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1001 CI->getDebugLoc(),
1002 CI->getParent())
1003 << "Cannot interchange loops due to call instruction.";
1004 });
1005
1006 return false;
1007 }
1008
1009 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1010 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1011 return false;
1012 }
1013
1014 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1015 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1016 ORE->emit([&]() {
1017 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1018 InnerLoop->getStartLoc(),
1019 InnerLoop->getHeader())
1020 << "Cannot interchange loops because unsupported PHI nodes found "
1021 "in inner loop latch.";
1022 });
1023 return false;
1024 }
1025
1026 // TODO: The loops could not be interchanged due to current limitations in the
1027 // transform module.
1028 if (currentLimitations()) {
1029 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1030 return false;
1031 }
1032
1033 // Check if the loops are tightly nested.
1034 if (!tightlyNested(OuterLoop, InnerLoop)) {
1035 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1036 ORE->emit([&]() {
1037 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1038 InnerLoop->getStartLoc(),
1039 InnerLoop->getHeader())
1040 << "Cannot interchange loops because they are not tightly "
1041 "nested.";
1042 });
1043 return false;
1044 }
1045
1046 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1047 OuterInnerReductions)) {
1048 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1049 ORE->emit([&]() {
1050 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1051 InnerLoop->getStartLoc(),
1052 InnerLoop->getHeader())
1053 << "Found unsupported PHI node in loop exit.";
1054 });
1055 return false;
1056 }
1057
1058 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1059 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1060 ORE->emit([&]() {
1061 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1062 OuterLoop->getStartLoc(),
1063 OuterLoop->getHeader())
1064 << "Found unsupported PHI node in loop exit.";
1065 });
1066 return false;
1067 }
1068
1069 return true;
1070}
1071
1072int LoopInterchangeProfitability::getInstrOrderCost() {
1073 unsigned GoodOrder, BadOrder;
1074 BadOrder = GoodOrder = 0;
1075 for (BasicBlock *BB : InnerLoop->blocks()) {
1076 for (Instruction &Ins : *BB) {
1077 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1078 unsigned NumOp = GEP->getNumOperands();
1079 bool FoundInnerInduction = false;
1080 bool FoundOuterInduction = false;
1081 for (unsigned i = 0; i < NumOp; ++i) {
1082 // Skip operands that are not SCEV-able.
1083 if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1084 continue;
1085
1086 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1087 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1088 if (!AR)
1089 continue;
1090
1091 // If we find the inner induction after an outer induction e.g.
1092 // for(int i=0;i<N;i++)
1093 // for(int j=0;j<N;j++)
1094 // A[i][j] = A[i-1][j-1]+k;
1095 // then it is a good order.
1096 if (AR->getLoop() == InnerLoop) {
1097 // We found an InnerLoop induction after OuterLoop induction. It is
1098 // a good order.
1099 FoundInnerInduction = true;
1100 if (FoundOuterInduction) {
1101 GoodOrder++;
1102 break;
1103 }
1104 }
1105 // If we find the outer induction after an inner induction e.g.
1106 // for(int i=0;i<N;i++)
1107 // for(int j=0;j<N;j++)
1108 // A[j][i] = A[j-1][i-1]+k;
1109 // then it is a bad order.
1110 if (AR->getLoop() == OuterLoop) {
1111 // We found an OuterLoop induction after InnerLoop induction. It is
1112 // a bad order.
1113 FoundOuterInduction = true;
1114 if (FoundInnerInduction) {
1115 BadOrder++;
1116 break;
1117 }
1118 }
1119 }
1120 }
1121 }
1122 }
1123 return GoodOrder - BadOrder;
1124}
1125
1126std::optional<bool>
1127LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1128 const DenseMap<const Loop *, unsigned> &CostMap,
1129 std::unique_ptr<CacheCost> &CC) {
1130 // This is the new cost model returned from loop cache analysis.
1131 // A smaller index means the loop should be placed an outer loop, and vice
1132 // versa.
1133 if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
1134 unsigned InnerIndex = 0, OuterIndex = 0;
1135 InnerIndex = CostMap.find(InnerLoop)->second;
1136 OuterIndex = CostMap.find(OuterLoop)->second;
1137 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1138 << ", OuterIndex = " << OuterIndex << "\n");
1139 if (InnerIndex < OuterIndex)
1140 return std::optional<bool>(true);
1141 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1142 "numbers to each loop");
1143 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1144 return std::nullopt;
1145 return std::optional<bool>(false);
1146 }
1147 return std::nullopt;
1148}
1149
1150std::optional<bool>
1151LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1152 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1153 // good and bad order of induction variables in the instruction and allows
1154 // reordering if number of bad orders is more than good.
1155 int Cost = getInstrOrderCost();
1156 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1158 return std::optional<bool>(true);
1159
1160 return std::nullopt;
1161}
1162
1163std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1164 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1165 for (auto &Row : DepMatrix) {
1166 // If the inner loop is loop independent or doesn't carry any dependency
1167 // it is not profitable to move this to outer position, since we are
1168 // likely able to do inner loop vectorization already.
1169 if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
1170 return std::optional<bool>(false);
1171
1172 // If the outer loop is not loop independent it is not profitable to move
1173 // this to inner position, since doing so would not enable inner loop
1174 // parallelism.
1175 if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
1176 return std::optional<bool>(false);
1177 }
1178 // If inner loop has dependence and outer loop is loop independent then it
1179 // is/ profitable to interchange to enable inner loop parallelism.
1180 // If there are no dependences, interchanging will not improve anything.
1181 return std::optional<bool>(!DepMatrix.empty());
1182}
1183
1184bool LoopInterchangeProfitability::isProfitable(
1185 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1186 unsigned OuterLoopId, CharMatrix &DepMatrix,
1187 const DenseMap<const Loop *, unsigned> &CostMap,
1188 std::unique_ptr<CacheCost> &CC) {
1189 // isProfitable() is structured to avoid endless loop interchange.
1190 // If loop cache analysis could decide the profitability then,
1191 // profitability check will stop and return the analysis result.
1192 // If cache analysis failed to analyze the loopnest (e.g.,
1193 // due to delinearization issues) then only check whether it is
1194 // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
1195 // analysis the profitability then only, isProfitableForVectorization
1196 // will decide.
1197 std::optional<bool> shouldInterchange =
1198 isProfitablePerLoopCacheAnalysis(CostMap, CC);
1199 if (!shouldInterchange.has_value()) {
1200 shouldInterchange = isProfitablePerInstrOrderCost();
1201 if (!shouldInterchange.has_value())
1202 shouldInterchange =
1203 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1204 }
1205 if (!shouldInterchange.has_value()) {
1206 ORE->emit([&]() {
1207 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1208 InnerLoop->getStartLoc(),
1209 InnerLoop->getHeader())
1210 << "Insufficient information to calculate the cost of loop for "
1211 "interchange.";
1212 });
1213 return false;
1214 } else if (!shouldInterchange.value()) {
1215 ORE->emit([&]() {
1216 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1217 InnerLoop->getStartLoc(),
1218 InnerLoop->getHeader())
1219 << "Interchanging loops is not considered to improve cache "
1220 "locality nor vectorization.";
1221 });
1222 return false;
1223 }
1224 return true;
1225}
1226
1227void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1228 Loop *InnerLoop) {
1229 for (Loop *L : *OuterLoop)
1230 if (L == InnerLoop) {
1231 OuterLoop->removeChildLoop(L);
1232 return;
1233 }
1234 llvm_unreachable("Couldn't find loop");
1235}
1236
1237/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1238/// new inner and outer loop after interchanging: NewInner is the original
1239/// outer loop and NewOuter is the original inner loop.
1240///
1241/// Before interchanging, we have the following structure
1242/// Outer preheader
1243// Outer header
1244// Inner preheader
1245// Inner header
1246// Inner body
1247// Inner latch
1248// outer bbs
1249// Outer latch
1250//
1251// After interchanging:
1252// Inner preheader
1253// Inner header
1254// Outer preheader
1255// Outer header
1256// Inner body
1257// outer bbs
1258// Outer latch
1259// Inner latch
1260void LoopInterchangeTransform::restructureLoops(
1261 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1262 BasicBlock *OrigOuterPreHeader) {
1263 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1264 // The original inner loop preheader moves from the new inner loop to
1265 // the parent loop, if there is one.
1266 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1267 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1268
1269 // Switch the loop levels.
1270 if (OuterLoopParent) {
1271 // Remove the loop from its parent loop.
1272 removeChildLoop(OuterLoopParent, NewInner);
1273 removeChildLoop(NewInner, NewOuter);
1274 OuterLoopParent->addChildLoop(NewOuter);
1275 } else {
1276 removeChildLoop(NewInner, NewOuter);
1277 LI->changeTopLevelLoop(NewInner, NewOuter);
1278 }
1279 while (!NewOuter->isInnermost())
1280 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1281 NewOuter->addChildLoop(NewInner);
1282
1283 // BBs from the original inner loop.
1284 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1285
1286 // Add BBs from the original outer loop to the original inner loop (excluding
1287 // BBs already in inner loop)
1288 for (BasicBlock *BB : NewInner->blocks())
1289 if (LI->getLoopFor(BB) == NewInner)
1290 NewOuter->addBlockEntry(BB);
1291
1292 // Now remove inner loop header and latch from the new inner loop and move
1293 // other BBs (the loop body) to the new inner loop.
1294 BasicBlock *OuterHeader = NewOuter->getHeader();
1295 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1296 for (BasicBlock *BB : OrigInnerBBs) {
1297 // Nothing will change for BBs in child loops.
1298 if (LI->getLoopFor(BB) != NewOuter)
1299 continue;
1300 // Remove the new outer loop header and latch from the new inner loop.
1301 if (BB == OuterHeader || BB == OuterLatch)
1302 NewInner->removeBlockFromLoop(BB);
1303 else
1304 LI->changeLoopFor(BB, NewInner);
1305 }
1306
1307 // The preheader of the original outer loop becomes part of the new
1308 // outer loop.
1309 NewOuter->addBlockEntry(OrigOuterPreHeader);
1310 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1311
1312 // Tell SE that we move the loops around.
1313 SE->forgetLoop(NewOuter);
1314}
1315
1316bool LoopInterchangeTransform::transform() {
1317 bool Transformed = false;
1318
1319 if (InnerLoop->getSubLoops().empty()) {
1320 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1321 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1322 auto &InductionPHIs = LIL.getInnerLoopInductions();
1323 if (InductionPHIs.empty()) {
1324 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1325 return false;
1326 }
1327
1328 SmallVector<Instruction *, 8> InnerIndexVarList;
1329 for (PHINode *CurInductionPHI : InductionPHIs) {
1330 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1331 InnerIndexVarList.push_back(
1332 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1333 else
1334 InnerIndexVarList.push_back(
1335 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1336 }
1337
1338 // Create a new latch block for the inner loop. We split at the
1339 // current latch's terminator and then move the condition and all
1340 // operands that are not either loop-invariant or the induction PHI into the
1341 // new latch block.
1342 BasicBlock *NewLatch =
1343 SplitBlock(InnerLoop->getLoopLatch(),
1344 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1345
1347 unsigned i = 0;
1348 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1349 for (; i < WorkList.size(); i++) {
1350 // Duplicate instruction and move it the new latch. Update uses that
1351 // have been moved.
1352 Instruction *NewI = WorkList[i]->clone();
1353 NewI->insertBefore(NewLatch->getFirstNonPHI());
1354 assert(!NewI->mayHaveSideEffects() &&
1355 "Moving instructions with side-effects may change behavior of "
1356 "the loop nest!");
1357 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1358 Instruction *UserI = cast<Instruction>(U.getUser());
1359 if (!InnerLoop->contains(UserI->getParent()) ||
1360 UserI->getParent() == NewLatch ||
1361 llvm::is_contained(InductionPHIs, UserI))
1362 U.set(NewI);
1363 }
1364 // Add operands of moved instruction to the worklist, except if they are
1365 // outside the inner loop or are the induction PHI.
1366 for (Value *Op : WorkList[i]->operands()) {
1367 Instruction *OpI = dyn_cast<Instruction>(Op);
1368 if (!OpI ||
1369 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1370 llvm::is_contained(InductionPHIs, OpI))
1371 continue;
1372 WorkList.insert(OpI);
1373 }
1374 }
1375 };
1376
1377 // FIXME: Should we interchange when we have a constant condition?
1378 Instruction *CondI = dyn_cast<Instruction>(
1379 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1380 ->getCondition());
1381 if (CondI)
1382 WorkList.insert(CondI);
1383 MoveInstructions();
1384 for (Instruction *InnerIndexVar : InnerIndexVarList)
1385 WorkList.insert(cast<Instruction>(InnerIndexVar));
1386 MoveInstructions();
1387 }
1388
1389 // Ensure the inner loop phi nodes have a separate basic block.
1390 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1391 if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {
1392 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1393 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1394 }
1395
1396 // Instructions in the original inner loop preheader may depend on values
1397 // defined in the outer loop header. Move them there, because the original
1398 // inner loop preheader will become the entry into the interchanged loop nest.
1399 // Currently we move all instructions and rely on LICM to move invariant
1400 // instructions outside the loop nest.
1401 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1402 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1403 if (InnerLoopPreHeader != OuterLoopHeader) {
1405 for (Instruction &I :
1406 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1407 std::prev(InnerLoopPreHeader->end()))))
1408 I.moveBeforePreserving(OuterLoopHeader->getTerminator());
1409 }
1410
1411 Transformed |= adjustLoopLinks();
1412 if (!Transformed) {
1413 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1414 return false;
1415 }
1416
1417 return true;
1418}
1419
1420/// \brief Move all instructions except the terminator from FromBB right before
1421/// InsertBefore
1422static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1423 BasicBlock *ToBB = InsertBefore->getParent();
1424
1425 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1426 FromBB->getTerminator()->getIterator());
1427}
1428
1429/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1430static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1431 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1432 // from BB1 afterwards.
1433 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1434 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1435 for (Instruction *I : TempInstrs)
1436 I->removeFromParent();
1437
1438 // Move instructions from BB2 to BB1.
1439 moveBBContents(BB2, BB1->getTerminator());
1440
1441 // Move instructions from TempInstrs to BB2.
1442 for (Instruction *I : TempInstrs)
1443 I->insertBefore(BB2->getTerminator());
1444}
1445
1446// Update BI to jump to NewBB instead of OldBB. Records updates to the
1447// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1448// \p OldBB is exactly once in BI's successor list.
1449static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1450 BasicBlock *NewBB,
1451 std::vector<DominatorTree::UpdateType> &DTUpdates,
1452 bool MustUpdateOnce = true) {
1453 assert((!MustUpdateOnce ||
1455 [OldBB](BasicBlock *BB) {
1456 return BB == OldBB;
1457 }) == 1) && "BI must jump to OldBB exactly once.");
1458 bool Changed = false;
1459 for (Use &Op : BI->operands())
1460 if (Op == OldBB) {
1461 Op.set(NewBB);
1462 Changed = true;
1463 }
1464
1465 if (Changed) {
1466 DTUpdates.push_back(
1467 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1468 DTUpdates.push_back(
1469 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1470 }
1471 assert(Changed && "Expected a successor to be updated");
1472}
1473
1474// Move Lcssa PHIs to the right place.
1475static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1476 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1477 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1478 Loop *InnerLoop, LoopInfo *LI) {
1479
1480 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1481 // defined either in the header or latch. Those blocks will become header and
1482 // latch of the new outer loop, and the only possible users can PHI nodes
1483 // in the exit block of the loop nest or the outer loop header (reduction
1484 // PHIs, in that case, the incoming value must be defined in the inner loop
1485 // header). We can just substitute the user with the incoming value and remove
1486 // the PHI.
1487 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1488 assert(P.getNumIncomingValues() == 1 &&
1489 "Only loops with a single exit are supported!");
1490
1491 // Incoming values are guaranteed be instructions currently.
1492 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1493 // In case of multi-level nested loops, follow LCSSA to find the incoming
1494 // value defined from the innermost loop.
1495 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1496 // Skip phis with incoming values from the inner loop body, excluding the
1497 // header and latch.
1498 if (IncIInnerMost->getParent() != InnerLatch &&
1499 IncIInnerMost->getParent() != InnerHeader)
1500 continue;
1501
1502 assert(all_of(P.users(),
1503 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1504 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1505 IncI->getParent() == InnerHeader) ||
1506 cast<PHINode>(U)->getParent() == OuterExit;
1507 }) &&
1508 "Can only replace phis iff the uses are in the loop nest exit or "
1509 "the incoming value is defined in the inner header (it will "
1510 "dominate all loop blocks after interchanging)");
1511 P.replaceAllUsesWith(IncI);
1512 P.eraseFromParent();
1513 }
1514
1515 SmallVector<PHINode *, 8> LcssaInnerExit;
1516 for (PHINode &P : InnerExit->phis())
1517 LcssaInnerExit.push_back(&P);
1518
1519 SmallVector<PHINode *, 8> LcssaInnerLatch;
1520 for (PHINode &P : InnerLatch->phis())
1521 LcssaInnerLatch.push_back(&P);
1522
1523 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1524 // If a PHI node has users outside of InnerExit, it has a use outside the
1525 // interchanged loop and we have to preserve it. We move these to
1526 // InnerLatch, which will become the new exit block for the innermost
1527 // loop after interchanging.
1528 for (PHINode *P : LcssaInnerExit)
1529 P->moveBefore(InnerLatch->getFirstNonPHI());
1530
1531 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1532 // and we have to move them to the new inner latch.
1533 for (PHINode *P : LcssaInnerLatch)
1534 P->moveBefore(InnerExit->getFirstNonPHI());
1535
1536 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1537 // incoming values defined in the outer loop, we have to add a new PHI
1538 // in the inner loop latch, which became the exit block of the outer loop,
1539 // after interchanging.
1540 if (OuterExit) {
1541 for (PHINode &P : OuterExit->phis()) {
1542 if (P.getNumIncomingValues() != 1)
1543 continue;
1544 // Skip Phis with incoming values defined in the inner loop. Those should
1545 // already have been updated.
1546 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1547 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1548 continue;
1549
1550 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1551 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1552 NewPhi->setIncomingBlock(0, OuterLatch);
1553 // We might have incoming edges from other BBs, i.e., the original outer
1554 // header.
1555 for (auto *Pred : predecessors(InnerLatch)) {
1556 if (Pred == OuterLatch)
1557 continue;
1558 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1559 }
1560 NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1561 P.setIncomingValue(0, NewPhi);
1562 }
1563 }
1564
1565 // Now adjust the incoming blocks for the LCSSA PHIs.
1566 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1567 // with the new latch.
1568 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1569}
1570
1571bool LoopInterchangeTransform::adjustLoopBranches() {
1572 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1573 std::vector<DominatorTree::UpdateType> DTUpdates;
1574
1575 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1576 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1577
1578 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1579 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1580 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1581 // Ensure that both preheaders do not contain PHI nodes and have single
1582 // predecessors. This allows us to move them easily. We use
1583 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1584 // preheaders do not satisfy those conditions.
1585 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1586 !OuterLoopPreHeader->getUniquePredecessor())
1587 OuterLoopPreHeader =
1588 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1589 if (InnerLoopPreHeader == OuterLoop->getHeader())
1590 InnerLoopPreHeader =
1591 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1592
1593 // Adjust the loop preheader
1594 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1595 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1596 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1597 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1598 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1599 BasicBlock *InnerLoopLatchPredecessor =
1600 InnerLoopLatch->getUniquePredecessor();
1601 BasicBlock *InnerLoopLatchSuccessor;
1602 BasicBlock *OuterLoopLatchSuccessor;
1603
1604 BranchInst *OuterLoopLatchBI =
1605 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1606 BranchInst *InnerLoopLatchBI =
1607 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1608 BranchInst *OuterLoopHeaderBI =
1609 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1610 BranchInst *InnerLoopHeaderBI =
1611 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1612
1613 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1614 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1615 !InnerLoopHeaderBI)
1616 return false;
1617
1618 BranchInst *InnerLoopLatchPredecessorBI =
1619 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1620 BranchInst *OuterLoopPredecessorBI =
1621 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1622
1623 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1624 return false;
1625 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1626 if (!InnerLoopHeaderSuccessor)
1627 return false;
1628
1629 // Adjust Loop Preheader and headers.
1630 // The branches in the outer loop predecessor and the outer loop header can
1631 // be unconditional branches or conditional branches with duplicates. Consider
1632 // this when updating the successors.
1633 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1634 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1635 // The outer loop header might or might not branch to the outer latch.
1636 // We are guaranteed to branch to the inner loop preheader.
1637 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1638 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1639 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1640 DTUpdates,
1641 /*MustUpdateOnce=*/false);
1642 }
1643 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1644 InnerLoopHeaderSuccessor, DTUpdates,
1645 /*MustUpdateOnce=*/false);
1646
1647 // Adjust reduction PHI's now that the incoming block has changed.
1648 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1649 OuterLoopHeader);
1650
1651 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1652 OuterLoopPreHeader, DTUpdates);
1653
1654 // -------------Adjust loop latches-----------
1655 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1656 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1657 else
1658 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1659
1660 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1661 InnerLoopLatchSuccessor, DTUpdates);
1662
1663 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1664 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1665 else
1666 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1667
1668 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1669 OuterLoopLatchSuccessor, DTUpdates);
1670 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1671 DTUpdates);
1672
1673 DT->applyUpdates(DTUpdates);
1674 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1675 OuterLoopPreHeader);
1676
1677 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1678 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1679 InnerLoop, LI);
1680 // For PHIs in the exit block of the outer loop, outer's latch has been
1681 // replaced by Inners'.
1682 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1683
1684 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1685 // Now update the reduction PHIs in the inner and outer loop headers.
1686 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1687 for (PHINode &PHI : InnerLoopHeader->phis())
1688 if (OuterInnerReductions.contains(&PHI))
1689 InnerLoopPHIs.push_back(&PHI);
1690
1691 for (PHINode &PHI : OuterLoopHeader->phis())
1692 if (OuterInnerReductions.contains(&PHI))
1693 OuterLoopPHIs.push_back(&PHI);
1694
1695 // Now move the remaining reduction PHIs from outer to inner loop header and
1696 // vice versa. The PHI nodes must be part of a reduction across the inner and
1697 // outer loop and all the remains to do is and updating the incoming blocks.
1698 for (PHINode *PHI : OuterLoopPHIs) {
1699 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1700 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1701 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1702 }
1703 for (PHINode *PHI : InnerLoopPHIs) {
1704 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1705 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1706 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1707 }
1708
1709 // Update the incoming blocks for moved PHI nodes.
1710 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1711 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1712 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1713 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1714
1715 // Values defined in the outer loop header could be used in the inner loop
1716 // latch. In that case, we need to create LCSSA phis for them, because after
1717 // interchanging they will be defined in the new inner loop and used in the
1718 // new outer loop.
1719 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1720 for (Instruction &I :
1721 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1722 MayNeedLCSSAPhis.push_back(&I);
1723 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
1724
1725 return true;
1726}
1727
1728bool LoopInterchangeTransform::adjustLoopLinks() {
1729 // Adjust all branches in the inner and outer loop.
1730 bool Changed = adjustLoopBranches();
1731 if (Changed) {
1732 // We have interchanged the preheaders so we need to interchange the data in
1733 // the preheaders as well. This is because the content of the inner
1734 // preheader was previously executed inside the outer loop.
1735 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1736 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1737 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1738 }
1739 return Changed;
1740}
1741
1745 LPMUpdater &U) {
1746 Function &F = *LN.getParent();
1747 SmallVector<Loop *, 8> LoopList(LN.getLoops());
1748
1749 if (MaxMemInstrCount < 1) {
1750 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
1751 return PreservedAnalyses::all();
1752 }
1754
1755 // Ensure minimum depth of the loop nest to do the interchange.
1756 if (!hasSupportedLoopDepth(LoopList, ORE))
1757 return PreservedAnalyses::all();
1758 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1759 std::unique_ptr<CacheCost> CC =
1761
1762 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1763 return PreservedAnalyses::all();
1764 U.markLoopNestChanged(true);
1766}
Rewrite undef for PHI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
#define LLVM_DEBUG(...)
Definition: Debug.h:106
Hexagon Common GEP
This file defines the interface for the loop cache analysis.
static cl::opt< unsigned int > MaxMemInstrCount("loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, cl::desc("Maximum number of load-store instructions that should be handled " "in the dependency matrix. Higher value may lead to more interchanges " "at the cost of compile-time"))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V)
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(SmallVectorImpl< Loop * > &LoopList, OptimizationRemarkEmitter &ORE)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore)
Move all instructions except the terminator from FromBB right before InsertBefore.
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static bool isLexicographicallyPositive(std::vector< char > &DV)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool isProfitable(const SmallVector< std::unique_ptr< StableFunctionMap::StableFunctionEntry > > &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
StringSet - A set-like wrapper for the StringMap.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:250
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
Definition: BasicBlock.cpp:651
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
Conditional or Unconditional Branch instruction.
iterator_range< succ_op_iterator > successors()
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
This class represents a function call, abstracting a target machine's calling convention.
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
This class represents an Operation in the Expression.
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
DependenceInfo - This class is the main dependence-analysis driver.
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
A struct for saving information about induction variables.
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:99
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
This node represents a polynomial recurrence on the trip count of the specified loop.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
std::pair< typename Base::iterator, bool > insert(StringRef key)
Definition: StringSet.h:38
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
LLVM Value Representation.
Definition: Value.h:74
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto successors(const MachineBasicBlock *BB)
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:325
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215