LLVM 22.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/InstrTypes.h"
35#include "llvm/IR/Instruction.h"
37#include "llvm/IR/User.h"
38#include "llvm/IR/Value.h"
41#include "llvm/Support/Debug.h"
48#include <cassert>
49#include <utility>
50#include <vector>
51
52using namespace llvm;
53
54#define DEBUG_TYPE "loop-interchange"
55
56STATISTIC(LoopsInterchanged, "Number of loops interchanged");
57
59 "loop-interchange-threshold", cl::init(0), cl::Hidden,
60 cl::desc("Interchange if you gain more than this number"));
61
62// Maximum number of load-stores that can be handled in the dependency matrix.
64 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
66 "Maximum number of load-store instructions that should be handled "
67 "in the dependency matrix. Higher value may lead to more interchanges "
68 "at the cost of compile-time"));
69
70namespace {
71
73
74/// A list of direction vectors. Each entry represents a direction vector
75/// corresponding to one or more dependencies existing in the loop nest. The
76/// length of all direction vectors is equal and is N + 1, where N is the depth
77/// of the loop nest. The first N elements correspond to the dependency
78/// direction of each N loops. The last one indicates whether this entry is
79/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
80/// is defined in LoopAccessAnalysis.
81// TODO: Check if we can use a sparse matrix here.
82using CharMatrix = std::vector<std::vector<char>>;
83
84/// Types of rules used in profitability check.
85enum class RuleTy {
86 PerLoopCacheAnalysis,
87 PerInstrOrderCost,
88 ForVectorization,
89 Ignore
90};
91
92} // end anonymous namespace
93
94// Minimum loop depth supported.
96 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
97 cl::desc("Minimum depth of loop nest considered for the transform"));
98
99// Maximum loop depth supported.
101 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
102 cl::desc("Maximum depth of loop nest considered for the transform"));
103
104// We prefer cache cost to vectorization by default.
106 "loop-interchange-profitabilities", cl::ZeroOrMore,
108 cl::desc("List of profitability heuristics to be used. They are applied in "
109 "the given order"),
110 cl::list_init<RuleTy>({RuleTy::PerLoopCacheAnalysis,
111 RuleTy::PerInstrOrderCost,
112 RuleTy::ForVectorization}),
113 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
114 "Prioritize loop cache cost"),
115 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
116 "Prioritize the IVs order of each instruction"),
117 clEnumValN(RuleTy::ForVectorization, "vectorize",
118 "Prioritize vectorization"),
119 clEnumValN(RuleTy::Ignore, "ignore",
120 "Ignore profitability, force interchange (does not "
121 "work with other options)")));
122
123#ifndef NDEBUG
126 for (RuleTy Rule : Rules) {
127 if (!Set.insert(Rule).second)
128 return false;
129 if (Rule == RuleTy::Ignore)
130 return false;
131 }
132 return true;
133}
134
135static void printDepMatrix(CharMatrix &DepMatrix) {
136 for (auto &Row : DepMatrix) {
137 // Drop the last element because it is a flag indicating whether this is
138 // forward dependency or not, which doesn't affect the legality check.
139 for (char D : drop_end(Row))
140 LLVM_DEBUG(dbgs() << D << " ");
141 LLVM_DEBUG(dbgs() << "\n");
142 }
143}
144
145/// Return true if \p Src appears before \p Dst in the same basic block.
146/// Precondition: \p Src and \Dst are distinct instructions within the same
147/// basic block.
148static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
149 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
150 "Expected Src and Dst to be different instructions in the same BB");
151
152 bool FoundSrc = false;
153 for (const Instruction &I : *(Src->getParent())) {
154 if (&I == Src) {
155 FoundSrc = true;
156 continue;
157 }
158 if (&I == Dst)
159 return FoundSrc;
160 }
161
162 llvm_unreachable("Dst not found");
163}
164#endif
165
166static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
167 Loop *L, DependenceInfo *DI,
168 ScalarEvolution *SE,
171
172 ValueVector MemInstr;
173
174 // For each block.
175 for (BasicBlock *BB : L->blocks()) {
176 // Scan the BB and collect legal loads and stores.
177 for (Instruction &I : *BB) {
178 if (!isa<Instruction>(I))
179 return false;
180 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
181 if (!Ld->isSimple())
182 return false;
183 MemInstr.push_back(&I);
184 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
185 if (!St->isSimple())
186 return false;
187 MemInstr.push_back(&I);
188 }
189 }
190 }
191
192 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
193 << " Loads and Stores to analyze\n");
194 if (MemInstr.size() > MaxMemInstrCount) {
195 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
196 << MaxMemInstrCount << " load/stores in a loop\n");
197 ORE->emit([&]() {
198 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
199 L->getStartLoc(), L->getHeader())
200 << "Number of loads/stores exceeded, the supported maximum "
201 "can be increased with option "
202 "-loop-interchange-maxmeminstr-count.";
203 });
204 return false;
205 }
206 ValueVector::iterator I, IE, J, JE;
207
208 // Manage direction vectors that are already seen. Map each direction vector
209 // to an index of DepMatrix at which it is stored.
211
212 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
213 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
214 std::vector<char> Dep;
217 // Ignore Input dependencies.
218 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
219 continue;
220 // Track Output, Flow, and Anti dependencies.
221 if (auto D = DI->depends(Src, Dst)) {
222 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
223 // If the direction vector is negative, normalize it to
224 // make it non-negative.
225 if (D->normalize(SE))
226 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
227 LLVM_DEBUG(StringRef DepType =
228 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
229 dbgs() << "Found " << DepType
230 << " dependency between Src and Dst\n"
231 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
232 unsigned Levels = D->getLevels();
233 char Direction;
234 for (unsigned II = 1; II <= Levels; ++II) {
235 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
236 // or `=`, for which we don't have an equivalent representation, so
237 // that the conservative approximation is necessary. The same goes for
238 // `DVEntry::GE`.
239 // TODO: Use of fine-grained expressions allows for more accurate
240 // analysis.
241 unsigned Dir = D->getDirection(II);
242 if (Dir == Dependence::DVEntry::LT)
243 Direction = '<';
244 else if (Dir == Dependence::DVEntry::GT)
245 Direction = '>';
246 else if (Dir == Dependence::DVEntry::EQ)
247 Direction = '=';
248 else
249 Direction = '*';
250 Dep.push_back(Direction);
251 }
252
253 // If the Dependence object doesn't have any information, fill the
254 // dependency vector with '*'.
255 if (D->isConfused()) {
256 assert(Dep.empty() && "Expected empty dependency vector");
257 Dep.assign(Level, '*');
258 }
259
260 while (Dep.size() != Level) {
261 Dep.push_back('I');
262 }
263
264 // If all the elements of any direction vector have only '*', legality
265 // can't be proven. Exit early to save compile time.
266 if (all_of(Dep, [](char C) { return C == '*'; })) {
267 ORE->emit([&]() {
268 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
269 L->getStartLoc(), L->getHeader())
270 << "All loops have dependencies in all directions.";
271 });
272 return false;
273 }
274
275 // Test whether the dependency is forward or not.
276 bool IsKnownForward = true;
277 if (Src->getParent() != Dst->getParent()) {
278 // In general, when Src and Dst are in different BBs, the execution
279 // order of them within a single iteration is not guaranteed. Treat
280 // conservatively as not-forward dependency in this case.
281 IsKnownForward = false;
282 } else {
283 // Src and Dst are in the same BB. If they are the different
284 // instructions, Src should appear before Dst in the BB as they are
285 // stored to MemInstr in that order.
286 assert((Src == Dst || inThisOrder(Src, Dst)) &&
287 "Unexpected instructions");
288
289 // If the Dependence object is reversed (due to normalization), it
290 // represents the dependency from Dst to Src, meaning it is a backward
291 // dependency. Otherwise it should be a forward dependency.
292 bool IsReversed = D->getSrc() != Src;
293 if (IsReversed)
294 IsKnownForward = false;
295 }
296
297 // Initialize the last element. Assume forward dependencies only; it
298 // will be updated later if there is any non-forward dependency.
299 Dep.push_back('<');
300
301 // The last element should express the "summary" among one or more
302 // direction vectors whose first N elements are the same (where N is
303 // the depth of the loop nest). Hence we exclude the last element from
304 // the Seen map.
305 auto [Ite, Inserted] = Seen.try_emplace(
306 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
307
308 // Make sure we only add unique entries to the dependency matrix.
309 if (Inserted)
310 DepMatrix.push_back(Dep);
311
312 // If we cannot prove that this dependency is forward, change the last
313 // element of the corresponding entry. Since a `[... *]` dependency
314 // includes a `[... <]` dependency, we do not need to keep both and
315 // change the existing entry instead.
316 if (!IsKnownForward)
317 DepMatrix[Ite->second].back() = '*';
318 }
319 }
320 }
321
322 return true;
323}
324
325// A loop is moved from index 'from' to an index 'to'. Update the Dependence
326// matrix by exchanging the two columns.
327static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
328 unsigned ToIndx) {
329 for (auto &Row : DepMatrix)
330 std::swap(Row[ToIndx], Row[FromIndx]);
331}
332
333// Check if a direction vector is lexicographically positive. Return true if it
334// is positive, nullopt if it is "zero", otherwise false.
335// [Theorem] A permutation of the loops in a perfect nest is legal if and only
336// if the direction matrix, after the same permutation is applied to its
337// columns, has no ">" direction as the leftmost non-"=" direction in any row.
338static std::optional<bool>
339isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
340 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
341 if (Direction == '<')
342 return true;
343 if (Direction == '>' || Direction == '*')
344 return false;
345 }
346 return std::nullopt;
347}
348
349// Checks if it is legal to interchange 2 loops.
350static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
351 unsigned InnerLoopId,
352 unsigned OuterLoopId) {
353 unsigned NumRows = DepMatrix.size();
354 std::vector<char> Cur;
355 // For each row check if it is valid to interchange.
356 for (unsigned Row = 0; Row < NumRows; ++Row) {
357 // Create temporary DepVector check its lexicographical order
358 // before and after swapping OuterLoop vs InnerLoop
359 Cur = DepMatrix[Row];
360
361 // If the surrounding loops already ensure that the direction vector is
362 // lexicographically positive, nothing within the loop will be able to break
363 // the dependence. In such a case we can skip the subsequent check.
364 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
365 continue;
366
367 // Check if the direction vector is lexicographically positive (or zero)
368 // for both before/after exchanged. Ignore the last element because it
369 // doesn't affect the legality.
370 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
371 return false;
372 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
373 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
374 return false;
375 }
376 return true;
377}
378
379static void populateWorklist(Loop &L, LoopVector &LoopList) {
380 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
381 << L.getHeader()->getParent()->getName() << " Loop: %"
382 << L.getHeader()->getName() << '\n');
383 assert(LoopList.empty() && "LoopList should initially be empty!");
384 Loop *CurrentLoop = &L;
385 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
386 while (!Vec->empty()) {
387 // The current loop has multiple subloops in it hence it is not tightly
388 // nested.
389 // Discard all loops above it added into Worklist.
390 if (Vec->size() != 1) {
391 LoopList = {};
392 return;
393 }
394
395 LoopList.push_back(CurrentLoop);
396 CurrentLoop = Vec->front();
397 Vec = &CurrentLoop->getSubLoops();
398 }
399 LoopList.push_back(CurrentLoop);
400}
401
404 unsigned LoopNestDepth = LoopList.size();
405 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
406 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
407 << ", the supported range is [" << MinLoopNestDepth
408 << ", " << MaxLoopNestDepth << "].\n");
409 Loop *OuterLoop = LoopList.front();
410 ORE.emit([&]() {
411 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
412 OuterLoop->getStartLoc(),
413 OuterLoop->getHeader())
414 << "Unsupported depth of loop nest, the supported range is ["
415 << std::to_string(MinLoopNestDepth) << ", "
416 << std::to_string(MaxLoopNestDepth) << "].\n";
417 });
418 return false;
419 }
420 return true;
421}
422
424 ArrayRef<Loop *> LoopList) {
425 for (Loop *L : LoopList) {
426 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
427 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
428 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
429 return false;
430 }
431 if (L->getNumBackEdges() != 1) {
432 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
433 return false;
434 }
435 if (!L->getExitingBlock()) {
436 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
437 return false;
438 }
439 }
440 return true;
441}
442
443namespace {
444
445/// LoopInterchangeLegality checks if it is legal to interchange the loop.
446class LoopInterchangeLegality {
447public:
448 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
449 OptimizationRemarkEmitter *ORE)
450 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
451
452 /// Check if the loops can be interchanged.
453 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
454 CharMatrix &DepMatrix);
455
456 /// Discover induction PHIs in the header of \p L. Induction
457 /// PHIs are added to \p Inductions.
458 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
459
460 /// Check if the loop structure is understood. We do not handle triangular
461 /// loops for now.
462 bool isLoopStructureUnderstood();
463
464 bool currentLimitations();
465
466 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
467 return OuterInnerReductions;
468 }
469
470 const ArrayRef<PHINode *> getInnerLoopInductions() const {
471 return InnerLoopInductions;
472 }
473
474 ArrayRef<Instruction *> getHasNoWrapReductions() const {
475 return HasNoWrapReductions;
476 }
477
478private:
479 bool tightlyNested(Loop *Outer, Loop *Inner);
480 bool containsUnsafeInstructions(BasicBlock *BB);
481
482 /// Discover induction and reduction PHIs in the header of \p L. Induction
483 /// PHIs are added to \p Inductions, reductions are added to
484 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
485 /// to be passed as \p InnerLoop.
486 bool findInductionAndReductions(Loop *L,
487 SmallVector<PHINode *, 8> &Inductions,
488 Loop *InnerLoop);
489
490 Loop *OuterLoop;
491 Loop *InnerLoop;
492
493 ScalarEvolution *SE;
494
495 /// Interface to emit optimization remarks.
496 OptimizationRemarkEmitter *ORE;
497
498 /// Set of reduction PHIs taking part of a reduction across the inner and
499 /// outer loop.
500 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
501
502 /// Set of inner loop induction PHIs
503 SmallVector<PHINode *, 8> InnerLoopInductions;
504
505 /// Hold instructions that have nuw/nsw flags and involved in reductions,
506 /// like integer addition/multiplication. Those flags must be dropped when
507 /// interchanging the loops.
508 SmallVector<Instruction *, 4> HasNoWrapReductions;
509};
510
511/// Manages information utilized by the profitability check for cache. The main
512/// purpose of this class is to delay the computation of CacheCost until it is
513/// actually needed.
514class CacheCostManager {
515 Loop *OutermostLoop;
516 LoopStandardAnalysisResults *AR;
517 DependenceInfo *DI;
518
519 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
520 /// that the result can be nullptr.
521 std::optional<std::unique_ptr<CacheCost>> CC;
522
523 /// Maps each loop to an index representing the optimal position within the
524 /// loop-nest, as determined by the cache cost analysis.
525 DenseMap<const Loop *, unsigned> CostMap;
526
527 void computeIfUnitinialized();
528
529public:
530 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
531 DependenceInfo *DI)
532 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
533 CacheCost *getCacheCost();
534 const DenseMap<const Loop *, unsigned> &getCostMap();
535};
536
537/// LoopInterchangeProfitability checks if it is profitable to interchange the
538/// loop.
539class LoopInterchangeProfitability {
540public:
541 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
542 OptimizationRemarkEmitter *ORE)
543 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
544
545 /// Check if the loop interchange is profitable.
546 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
547 unsigned InnerLoopId, unsigned OuterLoopId,
548 CharMatrix &DepMatrix, CacheCostManager &CCM);
549
550private:
551 int getInstrOrderCost();
552 std::optional<bool> isProfitablePerLoopCacheAnalysis(
553 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
554 std::optional<bool> isProfitablePerInstrOrderCost();
555 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
556 unsigned OuterLoopId,
557 CharMatrix &DepMatrix);
558 Loop *OuterLoop;
559 Loop *InnerLoop;
560
561 /// Scev analysis.
562 ScalarEvolution *SE;
563
564 /// Interface to emit optimization remarks.
565 OptimizationRemarkEmitter *ORE;
566};
567
568/// LoopInterchangeTransform interchanges the loop.
569class LoopInterchangeTransform {
570public:
571 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
572 LoopInfo *LI, DominatorTree *DT,
573 const LoopInterchangeLegality &LIL)
574 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
575
576 /// Interchange OuterLoop and InnerLoop.
577 bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
578 void restructureLoops(Loop *NewInner, Loop *NewOuter,
579 BasicBlock *OrigInnerPreHeader,
580 BasicBlock *OrigOuterPreHeader);
581 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
582
583private:
584 bool adjustLoopLinks();
585 bool adjustLoopBranches();
586
587 Loop *OuterLoop;
588 Loop *InnerLoop;
589
590 /// Scev analysis.
591 ScalarEvolution *SE;
592
593 LoopInfo *LI;
594 DominatorTree *DT;
595
596 const LoopInterchangeLegality &LIL;
597};
598
599struct LoopInterchange {
600 ScalarEvolution *SE = nullptr;
601 LoopInfo *LI = nullptr;
602 DependenceInfo *DI = nullptr;
603 DominatorTree *DT = nullptr;
604 LoopStandardAnalysisResults *AR = nullptr;
605
606 /// Interface to emit optimization remarks.
607 OptimizationRemarkEmitter *ORE;
608
609 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
610 DominatorTree *DT, LoopStandardAnalysisResults *AR,
611 OptimizationRemarkEmitter *ORE)
612 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
613
614 bool run(Loop *L) {
615 if (L->getParentLoop())
616 return false;
617 SmallVector<Loop *, 8> LoopList;
618 populateWorklist(*L, LoopList);
619 return processLoopList(LoopList);
620 }
621
622 bool run(LoopNest &LN) {
623 SmallVector<Loop *, 8> LoopList(LN.getLoops());
624 for (unsigned I = 1; I < LoopList.size(); ++I)
625 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
626 return false;
627 return processLoopList(LoopList);
628 }
629
630 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
631 // TODO: Add a better heuristic to select the loop to be interchanged based
632 // on the dependence matrix. Currently we select the innermost loop.
633 return LoopList.size() - 1;
634 }
635
636 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
637 bool Changed = false;
638
639 // Ensure proper loop nest depth.
640 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
641 "Unsupported depth of loop nest.");
642
643 unsigned LoopNestDepth = LoopList.size();
644
645 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
646 << "\n");
647
648 CharMatrix DependencyMatrix;
649 Loop *OuterMostLoop = *(LoopList.begin());
650 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
651 OuterMostLoop, DI, SE, ORE)) {
652 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
653 return false;
654 }
655
656 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
657 printDepMatrix(DependencyMatrix));
658
659 // Get the Outermost loop exit.
660 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
661 if (!LoopNestExit) {
662 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
663 return false;
664 }
665
666 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
667 CacheCostManager CCM(LoopList[0], AR, DI);
668 // We try to achieve the globally optimal memory access for the loopnest,
669 // and do interchange based on a bubble-sort fasion. We start from
670 // the innermost loop, move it outwards to the best possible position
671 // and repeat this process.
672 for (unsigned j = SelecLoopId; j > 0; j--) {
673 bool ChangedPerIter = false;
674 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
675 bool Interchanged =
676 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
677 ChangedPerIter |= Interchanged;
678 Changed |= Interchanged;
679 }
680 // Early abort if there was no interchange during an entire round of
681 // moving loops outwards.
682 if (!ChangedPerIter)
683 break;
684 }
685 return Changed;
686 }
687
688 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
689 unsigned OuterLoopId,
690 std::vector<std::vector<char>> &DependencyMatrix,
691 CacheCostManager &CCM) {
692 Loop *OuterLoop = LoopList[OuterLoopId];
693 Loop *InnerLoop = LoopList[InnerLoopId];
694 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
695 << " and OuterLoopId = " << OuterLoopId << "\n");
696 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
697 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
698 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
699 return false;
700 }
701 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
702 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
703 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
704 DependencyMatrix, CCM)) {
705 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
706 return false;
707 }
708
709 ORE->emit([&]() {
710 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
711 InnerLoop->getStartLoc(),
712 InnerLoop->getHeader())
713 << "Loop interchanged with enclosing loop.";
714 });
715
716 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
717 LIT.transform(LIL.getHasNoWrapReductions());
718 LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
719 LoopsInterchanged++;
720
721 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
722
723 // Loops interchanged, update LoopList accordingly.
724 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
725 // Update the DependencyMatrix
726 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
727
728 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
729 printDepMatrix(DependencyMatrix));
730
731 return true;
732 }
733};
734
735} // end anonymous namespace
736
737bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
738 return any_of(*BB, [](const Instruction &I) {
739 return I.mayHaveSideEffects() || I.mayReadFromMemory();
740 });
741}
742
743bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
744 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
745 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
746 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
747
748 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
749
750 // A perfectly nested loop will not have any branch in between the outer and
751 // inner block i.e. outer header will branch to either inner preheader and
752 // outerloop latch.
753 BranchInst *OuterLoopHeaderBI =
754 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
755 if (!OuterLoopHeaderBI)
756 return false;
757
758 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
759 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
760 Succ != OuterLoopLatch)
761 return false;
762
763 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
764 // We do not have any basic block in between now make sure the outer header
765 // and outer loop latch doesn't contain any unsafe instructions.
766 if (containsUnsafeInstructions(OuterLoopHeader) ||
767 containsUnsafeInstructions(OuterLoopLatch))
768 return false;
769
770 // Also make sure the inner loop preheader does not contain any unsafe
771 // instructions. Note that all instructions in the preheader will be moved to
772 // the outer loop header when interchanging.
773 if (InnerLoopPreHeader != OuterLoopHeader &&
774 containsUnsafeInstructions(InnerLoopPreHeader))
775 return false;
776
777 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
778 // Ensure the inner loop exit block flows to the outer loop latch possibly
779 // through empty blocks.
780 const BasicBlock &SuccInner =
781 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
782 if (&SuccInner != OuterLoopLatch) {
783 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
784 << " does not lead to the outer loop latch.\n";);
785 return false;
786 }
787 // The inner loop exit block does flow to the outer loop latch and not some
788 // other BBs, now make sure it contains safe instructions, since it will be
789 // moved into the (new) inner loop after interchange.
790 if (containsUnsafeInstructions(InnerLoopExit))
791 return false;
792
793 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
794 // We have a perfect loop nest.
795 return true;
796}
797
798bool LoopInterchangeLegality::isLoopStructureUnderstood() {
799 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
800 for (PHINode *InnerInduction : InnerLoopInductions) {
801 unsigned Num = InnerInduction->getNumOperands();
802 for (unsigned i = 0; i < Num; ++i) {
803 Value *Val = InnerInduction->getOperand(i);
804 if (isa<Constant>(Val))
805 continue;
807 if (!I)
808 return false;
809 // TODO: Handle triangular loops.
810 // e.g. for(int i=0;i<N;i++)
811 // for(int j=i;j<N;j++)
812 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
813 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
814 InnerLoopPreheader &&
815 !OuterLoop->isLoopInvariant(I)) {
816 return false;
817 }
818 }
819 }
820
821 // TODO: Handle triangular loops of another form.
822 // e.g. for(int i=0;i<N;i++)
823 // for(int j=0;j<i;j++)
824 // or,
825 // for(int i=0;i<N;i++)
826 // for(int j=0;j*i<N;j++)
827 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
828 BranchInst *InnerLoopLatchBI =
829 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
830 if (!InnerLoopLatchBI->isConditional())
831 return false;
832 if (CmpInst *InnerLoopCmp =
833 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
834 Value *Op0 = InnerLoopCmp->getOperand(0);
835 Value *Op1 = InnerLoopCmp->getOperand(1);
836
837 // LHS and RHS of the inner loop exit condition, e.g.,
838 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
839 Value *Left = nullptr;
840 Value *Right = nullptr;
841
842 // Check if V only involves inner loop induction variable.
843 // Return true if V is InnerInduction, or a cast from
844 // InnerInduction, or a binary operator that involves
845 // InnerInduction and a constant.
846 std::function<bool(Value *)> IsPathToInnerIndVar;
847 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
848 if (llvm::is_contained(InnerLoopInductions, V))
849 return true;
850 if (isa<Constant>(V))
851 return true;
853 if (!I)
854 return false;
855 if (isa<CastInst>(I))
856 return IsPathToInnerIndVar(I->getOperand(0));
858 return IsPathToInnerIndVar(I->getOperand(0)) &&
859 IsPathToInnerIndVar(I->getOperand(1));
860 return false;
861 };
862
863 // In case of multiple inner loop indvars, it is okay if LHS and RHS
864 // are both inner indvar related variables.
865 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
866 return true;
867
868 // Otherwise we check if the cmp instruction compares an inner indvar
869 // related variable (Left) with a outer loop invariant (Right).
870 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
871 Left = Op0;
872 Right = Op1;
873 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
874 Left = Op1;
875 Right = Op0;
876 }
877
878 if (Left == nullptr)
879 return false;
880
881 const SCEV *S = SE->getSCEV(Right);
882 if (!SE->isLoopInvariant(S, OuterLoop))
883 return false;
884 }
885
886 return true;
887}
888
889// If SV is a LCSSA PHI node with a single incoming value, return the incoming
890// value.
891static Value *followLCSSA(Value *SV) {
893 if (!PHI)
894 return SV;
895
896 if (PHI->getNumIncomingValues() != 1)
897 return SV;
898 return followLCSSA(PHI->getIncomingValue(0));
899}
900
901// Check V's users to see if it is involved in a reduction in L.
902static PHINode *
904 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
905 // Reduction variables cannot be constants.
906 if (isa<Constant>(V))
907 return nullptr;
908
909 for (Value *User : V->users()) {
911 if (PHI->getNumIncomingValues() == 1)
912 continue;
915 // Detect floating point reduction only when it can be reordered.
916 if (RD.getExactFPMathInst() != nullptr)
917 return nullptr;
918
920 switch (RK) {
921 case RecurKind::Or:
922 case RecurKind::And:
923 case RecurKind::Xor:
924 case RecurKind::SMin:
925 case RecurKind::SMax:
926 case RecurKind::UMin:
927 case RecurKind::UMax:
928 case RecurKind::FAdd:
929 case RecurKind::FMul:
930 case RecurKind::FMin:
931 case RecurKind::FMax:
937 case RecurKind::AnyOf:
938 return PHI;
939
940 // Change the order of integer addition/multiplication may change the
941 // semantics. Consider the following case:
942 //
943 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
944 // int sum = 0;
945 // for (int i = 0; i < 2; i++)
946 // for (int j = 0; j < 2; j++)
947 // sum += A[j][i];
948 //
949 // If the above loops are exchanged, the addition will cause an
950 // overflow. To prevent this, we must drop the nuw/nsw flags from the
951 // addition/multiplication instructions when we actually exchanges the
952 // loops.
953 case RecurKind::Add:
954 case RecurKind::Mul: {
955 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
957
958 // Bail out when we fail to collect reduction instructions chain.
959 if (Ops.empty())
960 return nullptr;
961
962 for (Instruction *I : Ops) {
963 assert(I->getOpcode() == OpCode &&
964 "Expected the instruction to be the reduction operation");
965 (void)OpCode;
966
967 // If the instruction has nuw/nsw flags, we must drop them when the
968 // transformation is actually performed.
969 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
970 HasNoWrapInsts.push_back(I);
971 }
972 return PHI;
973 }
974
975 default:
976 return nullptr;
977 }
978 }
979 return nullptr;
980 }
981 }
982
983 return nullptr;
984}
985
986bool LoopInterchangeLegality::findInductionAndReductions(
987 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
988 if (!L->getLoopLatch() || !L->getLoopPredecessor())
989 return false;
990 for (PHINode &PHI : L->getHeader()->phis()) {
991 InductionDescriptor ID;
993 Inductions.push_back(&PHI);
994 else {
995 // PHIs in inner loops need to be part of a reduction in the outer loop,
996 // discovered when checking the PHIs of the outer loop earlier.
997 if (!InnerLoop) {
998 if (!OuterInnerReductions.count(&PHI)) {
999 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
1000 "across the outer loop.\n");
1001 return false;
1002 }
1003 } else {
1004 assert(PHI.getNumIncomingValues() == 2 &&
1005 "Phis in loop header should have exactly 2 incoming values");
1006 // Check if we have a PHI node in the outer loop that has a reduction
1007 // result from the inner loop as an incoming value.
1008 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1009 PHINode *InnerRedPhi =
1010 findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions);
1011 if (!InnerRedPhi ||
1012 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1013 LLVM_DEBUG(
1014 dbgs()
1015 << "Failed to recognize PHI as an induction or reduction.\n");
1016 return false;
1017 }
1018 OuterInnerReductions.insert(&PHI);
1019 OuterInnerReductions.insert(InnerRedPhi);
1020 }
1021 }
1022 }
1023 return true;
1024}
1025
1026// This function indicates the current limitations in the transform as a result
1027// of which we do not proceed.
1028bool LoopInterchangeLegality::currentLimitations() {
1029 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1030
1031 // transform currently expects the loop latches to also be the exiting
1032 // blocks.
1033 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1034 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1035 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
1036 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1037 LLVM_DEBUG(
1038 dbgs() << "Loops where the latch is not the exiting block are not"
1039 << " supported currently.\n");
1040 ORE->emit([&]() {
1041 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1042 OuterLoop->getStartLoc(),
1043 OuterLoop->getHeader())
1044 << "Loops where the latch is not the exiting block cannot be"
1045 " interchange currently.";
1046 });
1047 return true;
1048 }
1049
1050 SmallVector<PHINode *, 8> Inductions;
1051 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1052 LLVM_DEBUG(
1053 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1054 << "are supported currently.\n");
1055 ORE->emit([&]() {
1056 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1057 OuterLoop->getStartLoc(),
1058 OuterLoop->getHeader())
1059 << "Only outer loops with induction or reduction PHI nodes can be"
1060 " interchanged currently.";
1061 });
1062 return true;
1063 }
1064
1065 Inductions.clear();
1066 // For multi-level loop nests, make sure that all phi nodes for inner loops
1067 // at all levels can be recognized as a induction or reduction phi. Bail out
1068 // if a phi node at a certain nesting level cannot be properly recognized.
1069 Loop *CurLevelLoop = OuterLoop;
1070 while (!CurLevelLoop->getSubLoops().empty()) {
1071 // We already made sure that the loop nest is tightly nested.
1072 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1073 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1074 LLVM_DEBUG(
1075 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1076 << "are supported currently.\n");
1077 ORE->emit([&]() {
1078 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1079 CurLevelLoop->getStartLoc(),
1080 CurLevelLoop->getHeader())
1081 << "Only inner loops with induction or reduction PHI nodes can be"
1082 " interchange currently.";
1083 });
1084 return true;
1085 }
1086 }
1087
1088 // TODO: Triangular loops are not handled for now.
1089 if (!isLoopStructureUnderstood()) {
1090 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1091 ORE->emit([&]() {
1092 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1093 InnerLoop->getStartLoc(),
1094 InnerLoop->getHeader())
1095 << "Inner loop structure not understood currently.";
1096 });
1097 return true;
1098 }
1099
1100 return false;
1101}
1102
1103bool LoopInterchangeLegality::findInductions(
1104 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1105 for (PHINode &PHI : L->getHeader()->phis()) {
1106 InductionDescriptor ID;
1108 Inductions.push_back(&PHI);
1109 }
1110 return !Inductions.empty();
1111}
1112
1113// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1114// users are either reduction PHIs or PHIs outside the outer loop (which means
1115// the we are only interested in the final value after the loop).
1116static bool
1119 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1120 for (PHINode &PHI : InnerExit->phis()) {
1121 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
1122 if (PHI.getNumIncomingValues() > 1)
1123 return false;
1124 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1125 PHINode *PN = dyn_cast<PHINode>(U);
1126 return !PN ||
1127 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
1128 })) {
1129 return false;
1130 }
1131 }
1132 return true;
1133}
1134
1135// We currently support LCSSA PHI nodes in the outer loop exit, if their
1136// incoming values do not come from the outer loop latch or if the
1137// outer loop latch has a single predecessor. In that case, the value will
1138// be available if both the inner and outer loop conditions are true, which
1139// will still be true after interchanging. If we have multiple predecessor,
1140// that may not be the case, e.g. because the outer loop latch may be executed
1141// if the inner loop is not executed.
1142static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1143 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1144 for (PHINode &PHI : LoopNestExit->phis()) {
1145 for (Value *Incoming : PHI.incoming_values()) {
1147 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1148 continue;
1149
1150 // The incoming value is defined in the outer loop latch. Currently we
1151 // only support that in case the outer loop latch has a single predecessor.
1152 // This guarantees that the outer loop latch is executed if and only if
1153 // the inner loop is executed (because tightlyNested() guarantees that the
1154 // outer loop header only branches to the inner loop or the outer loop
1155 // latch).
1156 // FIXME: We could weaken this logic and allow multiple predecessors,
1157 // if the values are produced outside the loop latch. We would need
1158 // additional logic to update the PHI nodes in the exit block as
1159 // well.
1160 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1161 return false;
1162 }
1163 }
1164 return true;
1165}
1166
1167// In case of multi-level nested loops, it may occur that lcssa phis exist in
1168// the latch of InnerLoop, i.e., when defs of the incoming values are further
1169// inside the loopnest. Sometimes those incoming values are not available
1170// after interchange, since the original inner latch will become the new outer
1171// latch which may have predecessor paths that do not include those incoming
1172// values.
1173// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1174// multi-level loop nests.
1175static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1176 if (InnerLoop->getSubLoops().empty())
1177 return true;
1178 // If the original outer latch has only one predecessor, then values defined
1179 // further inside the looploop, e.g., in the innermost loop, will be available
1180 // at the new outer latch after interchange.
1181 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1182 return true;
1183
1184 // The outer latch has more than one predecessors, i.e., the inner
1185 // exit and the inner header.
1186 // PHI nodes in the inner latch are lcssa phis where the incoming values
1187 // are defined further inside the loopnest. Check if those phis are used
1188 // in the original inner latch. If that is the case then bail out since
1189 // those incoming values may not be available at the new outer latch.
1190 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1191 for (PHINode &PHI : InnerLoopLatch->phis()) {
1192 for (auto *U : PHI.users()) {
1194 if (InnerLoopLatch == UI->getParent())
1195 return false;
1196 }
1197 }
1198 return true;
1199}
1200
1201bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1202 unsigned OuterLoopId,
1203 CharMatrix &DepMatrix) {
1204 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1205 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1206 << " and OuterLoopId = " << OuterLoopId
1207 << " due to dependence\n");
1208 ORE->emit([&]() {
1209 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1210 InnerLoop->getStartLoc(),
1211 InnerLoop->getHeader())
1212 << "Cannot interchange loops due to dependences.";
1213 });
1214 return false;
1215 }
1216 // Check if outer and inner loop contain legal instructions only.
1217 for (auto *BB : OuterLoop->blocks())
1218 for (Instruction &I : BB->instructionsWithoutDebug())
1219 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1220 // readnone functions do not prevent interchanging.
1221 if (CI->onlyWritesMemory())
1222 continue;
1223 LLVM_DEBUG(
1224 dbgs() << "Loops with call instructions cannot be interchanged "
1225 << "safely.");
1226 ORE->emit([&]() {
1227 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1228 CI->getDebugLoc(),
1229 CI->getParent())
1230 << "Cannot interchange loops due to call instruction.";
1231 });
1232
1233 return false;
1234 }
1235
1236 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1237 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1238 return false;
1239 }
1240
1241 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1242 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1243 ORE->emit([&]() {
1244 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1245 InnerLoop->getStartLoc(),
1246 InnerLoop->getHeader())
1247 << "Cannot interchange loops because unsupported PHI nodes found "
1248 "in inner loop latch.";
1249 });
1250 return false;
1251 }
1252
1253 // TODO: The loops could not be interchanged due to current limitations in the
1254 // transform module.
1255 if (currentLimitations()) {
1256 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1257 return false;
1258 }
1259
1260 // Check if the loops are tightly nested.
1261 if (!tightlyNested(OuterLoop, InnerLoop)) {
1262 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1263 ORE->emit([&]() {
1264 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1265 InnerLoop->getStartLoc(),
1266 InnerLoop->getHeader())
1267 << "Cannot interchange loops because they are not tightly "
1268 "nested.";
1269 });
1270 return false;
1271 }
1272
1273 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1274 OuterInnerReductions)) {
1275 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1276 ORE->emit([&]() {
1277 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1278 InnerLoop->getStartLoc(),
1279 InnerLoop->getHeader())
1280 << "Found unsupported PHI node in loop exit.";
1281 });
1282 return false;
1283 }
1284
1285 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1286 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1287 ORE->emit([&]() {
1288 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1289 OuterLoop->getStartLoc(),
1290 OuterLoop->getHeader())
1291 << "Found unsupported PHI node in loop exit.";
1292 });
1293 return false;
1294 }
1295
1296 return true;
1297}
1298
1299void CacheCostManager::computeIfUnitinialized() {
1300 if (CC.has_value())
1301 return;
1302
1303 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1304 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1305 // Obtain the loop vector returned from loop cache analysis beforehand,
1306 // and put each <Loop, index> pair into a map for constant time query
1307 // later. Indices in loop vector reprsent the optimal order of the
1308 // corresponding loop, e.g., given a loopnest with depth N, index 0
1309 // indicates the loop should be placed as the outermost loop and index N
1310 // indicates the loop should be placed as the innermost loop.
1311 //
1312 // For the old pass manager CacheCost would be null.
1313 if (*CC != nullptr)
1314 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1315 CostMap[Cost.first] = Idx;
1316}
1317
1318CacheCost *CacheCostManager::getCacheCost() {
1319 computeIfUnitinialized();
1320 return CC->get();
1321}
1322
1323const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1324 computeIfUnitinialized();
1325 return CostMap;
1326}
1327
1328int LoopInterchangeProfitability::getInstrOrderCost() {
1329 unsigned GoodOrder, BadOrder;
1330 BadOrder = GoodOrder = 0;
1331 for (BasicBlock *BB : InnerLoop->blocks()) {
1332 for (Instruction &Ins : *BB) {
1333 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1334 bool FoundInnerInduction = false;
1335 bool FoundOuterInduction = false;
1336 for (Value *Op : GEP->operands()) {
1337 // Skip operands that are not SCEV-able.
1338 if (!SE->isSCEVable(Op->getType()))
1339 continue;
1340
1341 const SCEV *OperandVal = SE->getSCEV(Op);
1342 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1343 if (!AR)
1344 continue;
1345
1346 // If we find the inner induction after an outer induction e.g.
1347 // for(int i=0;i<N;i++)
1348 // for(int j=0;j<N;j++)
1349 // A[i][j] = A[i-1][j-1]+k;
1350 // then it is a good order.
1351 if (AR->getLoop() == InnerLoop) {
1352 // We found an InnerLoop induction after OuterLoop induction. It is
1353 // a good order.
1354 FoundInnerInduction = true;
1355 if (FoundOuterInduction) {
1356 GoodOrder++;
1357 break;
1358 }
1359 }
1360 // If we find the outer induction after an inner induction e.g.
1361 // for(int i=0;i<N;i++)
1362 // for(int j=0;j<N;j++)
1363 // A[j][i] = A[j-1][i-1]+k;
1364 // then it is a bad order.
1365 if (AR->getLoop() == OuterLoop) {
1366 // We found an OuterLoop induction after InnerLoop induction. It is
1367 // a bad order.
1368 FoundOuterInduction = true;
1369 if (FoundInnerInduction) {
1370 BadOrder++;
1371 break;
1372 }
1373 }
1374 }
1375 }
1376 }
1377 }
1378 return GoodOrder - BadOrder;
1379}
1380
1381std::optional<bool>
1382LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1383 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1384 // This is the new cost model returned from loop cache analysis.
1385 // A smaller index means the loop should be placed an outer loop, and vice
1386 // versa.
1387 auto InnerLoopIt = CostMap.find(InnerLoop);
1388 if (InnerLoopIt == CostMap.end())
1389 return std::nullopt;
1390 auto OuterLoopIt = CostMap.find(OuterLoop);
1391 if (OuterLoopIt == CostMap.end())
1392 return std::nullopt;
1393
1394 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1395 return std::nullopt;
1396 unsigned InnerIndex = InnerLoopIt->second;
1397 unsigned OuterIndex = OuterLoopIt->second;
1398 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1399 << ", OuterIndex = " << OuterIndex << "\n");
1400 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1401 "numbers to each loop");
1402 return std::optional<bool>(InnerIndex < OuterIndex);
1403}
1404
1405std::optional<bool>
1406LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1407 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1408 // good and bad order of induction variables in the instruction and allows
1409 // reordering if number of bad orders is more than good.
1410 int Cost = getInstrOrderCost();
1411 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1413 return std::optional<bool>(true);
1414
1415 return std::nullopt;
1416}
1417
1418/// Return true if we can vectorize the loop specified by \p LoopId.
1419static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1420 for (const auto &Dep : DepMatrix) {
1421 char Dir = Dep[LoopId];
1422 char DepType = Dep.back();
1423 assert((DepType == '<' || DepType == '*') &&
1424 "Unexpected element in dependency vector");
1425
1426 // There are no loop-carried dependencies.
1427 if (Dir == '=' || Dir == 'I')
1428 continue;
1429
1430 // DepType being '<' means that this direction vector represents a forward
1431 // dependency. In principle, a loop with '<' direction can be vectorized in
1432 // this case.
1433 if (Dir == '<' && DepType == '<')
1434 continue;
1435
1436 // We cannot prove that the loop is vectorizable.
1437 return false;
1438 }
1439 return true;
1440}
1441
1442std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1443 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1444 // If the outer loop cannot be vectorized, it is not profitable to move this
1445 // to inner position.
1446 if (!canVectorize(DepMatrix, OuterLoopId))
1447 return false;
1448
1449 // If the inner loop cannot be vectorized but the outer loop can be, then it
1450 // is profitable to interchange to enable inner loop parallelism.
1451 if (!canVectorize(DepMatrix, InnerLoopId))
1452 return true;
1453
1454 // If both the inner and the outer loop can be vectorized, it is necessary to
1455 // check the cost of each vectorized loop for profitability decision. At this
1456 // time we do not have a cost model to estimate them, so return nullopt.
1457 // TODO: Estimate the cost of vectorized loop when both the outer and the
1458 // inner loop can be vectorized.
1459 return std::nullopt;
1460}
1461
1462bool LoopInterchangeProfitability::isProfitable(
1463 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1464 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1465 // Do not consider loops with a backedge that isn't taken, e.g. an
1466 // unconditional branch true/false, as candidates for interchange.
1467 // TODO: when interchange is forced, we should probably also allow
1468 // interchange for these loops, and thus this logic should be moved just
1469 // below the cost-model ignore check below. But this check is done first
1470 // to avoid the issue in #163954.
1471 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1472 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1473 if (InnerBTC && InnerBTC->isZero()) {
1474 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1475 "single iteration loop\n");
1476 return false;
1477 }
1478 if (OuterBTC && OuterBTC->isZero()) {
1479 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1480 "single iteration loop\n");
1481 return false;
1482 }
1483
1484 // Return true if interchange is forced and the cost-model ignored.
1485 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1486 return true;
1488 "Duplicate rules and option 'ignore' are not allowed");
1489
1490 // isProfitable() is structured to avoid endless loop interchange. If the
1491 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1492 // decide the profitability then, profitability check will stop and return the
1493 // analysis result. If it failed to determine it (e.g., cache analysis failed
1494 // to analyze the loopnest due to delinearization issues) then go ahead the
1495 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1496 // Likewise, if it failed to analysis the profitability then only, the last
1497 // rule (isProfitableForVectorization by default) will decide.
1498 std::optional<bool> shouldInterchange;
1499 for (RuleTy RT : Profitabilities) {
1500 switch (RT) {
1501 case RuleTy::PerLoopCacheAnalysis: {
1502 CacheCost *CC = CCM.getCacheCost();
1503 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1504 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1505 break;
1506 }
1507 case RuleTy::PerInstrOrderCost:
1508 shouldInterchange = isProfitablePerInstrOrderCost();
1509 break;
1510 case RuleTy::ForVectorization:
1511 shouldInterchange =
1512 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1513 break;
1514 case RuleTy::Ignore:
1515 llvm_unreachable("Option 'ignore' is not supported with other options");
1516 break;
1517 }
1518
1519 // If this rule could determine the profitability, don't call subsequent
1520 // rules.
1521 if (shouldInterchange.has_value())
1522 break;
1523 }
1524
1525 if (!shouldInterchange.has_value()) {
1526 ORE->emit([&]() {
1527 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1528 InnerLoop->getStartLoc(),
1529 InnerLoop->getHeader())
1530 << "Insufficient information to calculate the cost of loop for "
1531 "interchange.";
1532 });
1533 return false;
1534 } else if (!shouldInterchange.value()) {
1535 ORE->emit([&]() {
1536 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1537 InnerLoop->getStartLoc(),
1538 InnerLoop->getHeader())
1539 << "Interchanging loops is not considered to improve cache "
1540 "locality nor vectorization.";
1541 });
1542 return false;
1543 }
1544 return true;
1545}
1546
1547void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1548 Loop *InnerLoop) {
1549 for (Loop *L : *OuterLoop)
1550 if (L == InnerLoop) {
1551 OuterLoop->removeChildLoop(L);
1552 return;
1553 }
1554 llvm_unreachable("Couldn't find loop");
1555}
1556
1557/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1558/// new inner and outer loop after interchanging: NewInner is the original
1559/// outer loop and NewOuter is the original inner loop.
1560///
1561/// Before interchanging, we have the following structure
1562/// Outer preheader
1563// Outer header
1564// Inner preheader
1565// Inner header
1566// Inner body
1567// Inner latch
1568// outer bbs
1569// Outer latch
1570//
1571// After interchanging:
1572// Inner preheader
1573// Inner header
1574// Outer preheader
1575// Outer header
1576// Inner body
1577// outer bbs
1578// Outer latch
1579// Inner latch
1580void LoopInterchangeTransform::restructureLoops(
1581 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1582 BasicBlock *OrigOuterPreHeader) {
1583 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1584 // The original inner loop preheader moves from the new inner loop to
1585 // the parent loop, if there is one.
1586 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1587 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1588
1589 // Switch the loop levels.
1590 if (OuterLoopParent) {
1591 // Remove the loop from its parent loop.
1592 removeChildLoop(OuterLoopParent, NewInner);
1593 removeChildLoop(NewInner, NewOuter);
1594 OuterLoopParent->addChildLoop(NewOuter);
1595 } else {
1596 removeChildLoop(NewInner, NewOuter);
1597 LI->changeTopLevelLoop(NewInner, NewOuter);
1598 }
1599 while (!NewOuter->isInnermost())
1600 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1601 NewOuter->addChildLoop(NewInner);
1602
1603 // BBs from the original inner loop.
1604 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1605
1606 // Add BBs from the original outer loop to the original inner loop (excluding
1607 // BBs already in inner loop)
1608 for (BasicBlock *BB : NewInner->blocks())
1609 if (LI->getLoopFor(BB) == NewInner)
1610 NewOuter->addBlockEntry(BB);
1611
1612 // Now remove inner loop header and latch from the new inner loop and move
1613 // other BBs (the loop body) to the new inner loop.
1614 BasicBlock *OuterHeader = NewOuter->getHeader();
1615 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1616 for (BasicBlock *BB : OrigInnerBBs) {
1617 // Nothing will change for BBs in child loops.
1618 if (LI->getLoopFor(BB) != NewOuter)
1619 continue;
1620 // Remove the new outer loop header and latch from the new inner loop.
1621 if (BB == OuterHeader || BB == OuterLatch)
1622 NewInner->removeBlockFromLoop(BB);
1623 else
1624 LI->changeLoopFor(BB, NewInner);
1625 }
1626
1627 // The preheader of the original outer loop becomes part of the new
1628 // outer loop.
1629 NewOuter->addBlockEntry(OrigOuterPreHeader);
1630 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1631
1632 // Tell SE that we move the loops around.
1633 SE->forgetLoop(NewOuter);
1634}
1635
1636bool LoopInterchangeTransform::transform(
1637 ArrayRef<Instruction *> DropNoWrapInsts) {
1638 bool Transformed = false;
1639
1640 if (InnerLoop->getSubLoops().empty()) {
1641 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1642 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1643 auto &InductionPHIs = LIL.getInnerLoopInductions();
1644 if (InductionPHIs.empty()) {
1645 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1646 return false;
1647 }
1648
1649 SmallVector<Instruction *, 8> InnerIndexVarList;
1650 for (PHINode *CurInductionPHI : InductionPHIs) {
1651 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1652 InnerIndexVarList.push_back(
1653 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1654 else
1655 InnerIndexVarList.push_back(
1656 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1657 }
1658
1659 // Create a new latch block for the inner loop. We split at the
1660 // current latch's terminator and then move the condition and all
1661 // operands that are not either loop-invariant or the induction PHI into the
1662 // new latch block.
1663 BasicBlock *NewLatch =
1664 SplitBlock(InnerLoop->getLoopLatch(),
1665 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1666
1667 SmallSetVector<Instruction *, 4> WorkList;
1668 unsigned i = 0;
1669 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1670 for (; i < WorkList.size(); i++) {
1671 // Duplicate instruction and move it the new latch. Update uses that
1672 // have been moved.
1673 Instruction *NewI = WorkList[i]->clone();
1674 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
1675 assert(!NewI->mayHaveSideEffects() &&
1676 "Moving instructions with side-effects may change behavior of "
1677 "the loop nest!");
1678 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1679 Instruction *UserI = cast<Instruction>(U.getUser());
1680 if (!InnerLoop->contains(UserI->getParent()) ||
1681 UserI->getParent() == NewLatch ||
1682 llvm::is_contained(InductionPHIs, UserI))
1683 U.set(NewI);
1684 }
1685 // Add operands of moved instruction to the worklist, except if they are
1686 // outside the inner loop or are the induction PHI.
1687 for (Value *Op : WorkList[i]->operands()) {
1689 if (!OpI ||
1690 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1691 llvm::is_contained(InductionPHIs, OpI))
1692 continue;
1693 WorkList.insert(OpI);
1694 }
1695 }
1696 };
1697
1698 // FIXME: Should we interchange when we have a constant condition?
1701 ->getCondition());
1702 if (CondI)
1703 WorkList.insert(CondI);
1704 MoveInstructions();
1705 for (Instruction *InnerIndexVar : InnerIndexVarList)
1706 WorkList.insert(cast<Instruction>(InnerIndexVar));
1707 MoveInstructions();
1708 }
1709
1710 // Ensure the inner loop phi nodes have a separate basic block.
1711 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1712 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
1713 InnerLoopHeader->getTerminator()) {
1714 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
1715 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1716 }
1717
1718 // Instructions in the original inner loop preheader may depend on values
1719 // defined in the outer loop header. Move them there, because the original
1720 // inner loop preheader will become the entry into the interchanged loop nest.
1721 // Currently we move all instructions and rely on LICM to move invariant
1722 // instructions outside the loop nest.
1723 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1724 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1725 if (InnerLoopPreHeader != OuterLoopHeader) {
1726 for (Instruction &I :
1727 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1728 std::prev(InnerLoopPreHeader->end()))))
1729 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
1730 }
1731
1732 Transformed |= adjustLoopLinks();
1733 if (!Transformed) {
1734 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1735 return false;
1736 }
1737
1738 // Finally, drop the nsw/nuw flags from the instructions for reduction
1739 // calculations.
1740 for (Instruction *Reduction : DropNoWrapInsts) {
1741 Reduction->setHasNoSignedWrap(false);
1742 Reduction->setHasNoUnsignedWrap(false);
1743 }
1744
1745 return true;
1746}
1747
1748/// \brief Move all instructions except the terminator from FromBB right before
1749/// InsertBefore
1750static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1751 BasicBlock *ToBB = InsertBefore->getParent();
1752
1753 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1754 FromBB->getTerminator()->getIterator());
1755}
1756
1757/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1758static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1759 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1760 // from BB1 afterwards.
1761 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1762 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1763 for (Instruction *I : TempInstrs)
1764 I->removeFromParent();
1765
1766 // Move instructions from BB2 to BB1.
1767 moveBBContents(BB2, BB1->getTerminator());
1768
1769 // Move instructions from TempInstrs to BB2.
1770 for (Instruction *I : TempInstrs)
1771 I->insertBefore(BB2->getTerminator()->getIterator());
1772}
1773
1774// Update BI to jump to NewBB instead of OldBB. Records updates to the
1775// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1776// \p OldBB is exactly once in BI's successor list.
1777static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1778 BasicBlock *NewBB,
1779 std::vector<DominatorTree::UpdateType> &DTUpdates,
1780 bool MustUpdateOnce = true) {
1781 assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) &&
1782 "BI must jump to OldBB exactly once.");
1783 bool Changed = false;
1784 for (Use &Op : BI->operands())
1785 if (Op == OldBB) {
1786 Op.set(NewBB);
1787 Changed = true;
1788 }
1789
1790 if (Changed) {
1791 DTUpdates.push_back(
1792 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1793 DTUpdates.push_back(
1794 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1795 }
1796 assert(Changed && "Expected a successor to be updated");
1797}
1798
1799// Move Lcssa PHIs to the right place.
1800static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1801 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1802 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1803 Loop *InnerLoop, LoopInfo *LI) {
1804
1805 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1806 // defined either in the header or latch. Those blocks will become header and
1807 // latch of the new outer loop, and the only possible users can PHI nodes
1808 // in the exit block of the loop nest or the outer loop header (reduction
1809 // PHIs, in that case, the incoming value must be defined in the inner loop
1810 // header). We can just substitute the user with the incoming value and remove
1811 // the PHI.
1812 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1813 assert(P.getNumIncomingValues() == 1 &&
1814 "Only loops with a single exit are supported!");
1815
1816 // Incoming values are guaranteed be instructions currently.
1817 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1818 // In case of multi-level nested loops, follow LCSSA to find the incoming
1819 // value defined from the innermost loop.
1820 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1821 // Skip phis with incoming values from the inner loop body, excluding the
1822 // header and latch.
1823 if (IncIInnerMost->getParent() != InnerLatch &&
1824 IncIInnerMost->getParent() != InnerHeader)
1825 continue;
1826
1827 assert(all_of(P.users(),
1828 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1829 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1830 IncI->getParent() == InnerHeader) ||
1831 cast<PHINode>(U)->getParent() == OuterExit;
1832 }) &&
1833 "Can only replace phis iff the uses are in the loop nest exit or "
1834 "the incoming value is defined in the inner header (it will "
1835 "dominate all loop blocks after interchanging)");
1836 P.replaceAllUsesWith(IncI);
1837 P.eraseFromParent();
1838 }
1839
1840 SmallVector<PHINode *, 8> LcssaInnerExit(
1841 llvm::make_pointer_range(InnerExit->phis()));
1842
1843 SmallVector<PHINode *, 8> LcssaInnerLatch(
1844 llvm::make_pointer_range(InnerLatch->phis()));
1845
1846 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1847 // If a PHI node has users outside of InnerExit, it has a use outside the
1848 // interchanged loop and we have to preserve it. We move these to
1849 // InnerLatch, which will become the new exit block for the innermost
1850 // loop after interchanging.
1851 for (PHINode *P : LcssaInnerExit)
1852 P->moveBefore(InnerLatch->getFirstNonPHIIt());
1853
1854 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1855 // and we have to move them to the new inner latch.
1856 for (PHINode *P : LcssaInnerLatch)
1857 P->moveBefore(InnerExit->getFirstNonPHIIt());
1858
1859 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1860 // incoming values defined in the outer loop, we have to add a new PHI
1861 // in the inner loop latch, which became the exit block of the outer loop,
1862 // after interchanging.
1863 if (OuterExit) {
1864 for (PHINode &P : OuterExit->phis()) {
1865 if (P.getNumIncomingValues() != 1)
1866 continue;
1867 // Skip Phis with incoming values defined in the inner loop. Those should
1868 // already have been updated.
1869 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1870 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1871 continue;
1872
1873 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1874 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1875 NewPhi->setIncomingBlock(0, OuterLatch);
1876 // We might have incoming edges from other BBs, i.e., the original outer
1877 // header.
1878 for (auto *Pred : predecessors(InnerLatch)) {
1879 if (Pred == OuterLatch)
1880 continue;
1881 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1882 }
1883 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
1884 P.setIncomingValue(0, NewPhi);
1885 }
1886 }
1887
1888 // Now adjust the incoming blocks for the LCSSA PHIs.
1889 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1890 // with the new latch.
1891 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1892}
1893
1894/// This deals with a corner case when a LCSSA phi node appears in a non-exit
1895/// block: the outer loop latch block does not need to be exit block of the
1896/// inner loop. Consider a loop that was in LCSSA form, but then some
1897/// transformation like loop-unswitch comes along and creates an empty block,
1898/// where BB5 in this example is the outer loop latch block:
1899///
1900/// BB4:
1901/// br label %BB5
1902/// BB5:
1903/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
1904/// br outer.header
1905///
1906/// Interchange then brings it in LCSSA form again resulting in this chain of
1907/// single-input phi nodes:
1908///
1909/// BB4:
1910/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
1911/// br label %BB5
1912/// BB5:
1913/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
1914///
1915/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
1916/// use before the def if we don't check this. The solution is to simplify
1917/// lcssa phi nodes (remove) if they appear in non-exit blocks.
1918///
1919static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
1920 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
1921 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1922
1923 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
1924 if (OuterLoopLatch == InnerLoopExit)
1925 return;
1926
1927 // Collect and remove phis in non-exit blocks if they have 1 input.
1929 llvm::make_pointer_range(OuterLoopLatch->phis()));
1930 for (PHINode *Phi : Phis) {
1931 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
1932 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
1933 << "\n");
1934 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
1935 Phi->eraseFromParent();
1936 }
1937}
1938
1939bool LoopInterchangeTransform::adjustLoopBranches() {
1940 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1941 std::vector<DominatorTree::UpdateType> DTUpdates;
1942
1943 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1944 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1945
1946 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1947 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1948 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1949
1950 simplifyLCSSAPhis(OuterLoop, InnerLoop);
1951
1952 // Ensure that both preheaders do not contain PHI nodes and have single
1953 // predecessors. This allows us to move them easily. We use
1954 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1955 // preheaders do not satisfy those conditions.
1956 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1957 !OuterLoopPreHeader->getUniquePredecessor())
1958 OuterLoopPreHeader =
1959 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1960 if (InnerLoopPreHeader == OuterLoop->getHeader())
1961 InnerLoopPreHeader =
1962 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1963
1964 // Adjust the loop preheader
1965 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1966 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1967 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1968 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1969 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1970 BasicBlock *InnerLoopLatchPredecessor =
1971 InnerLoopLatch->getUniquePredecessor();
1972 BasicBlock *InnerLoopLatchSuccessor;
1973 BasicBlock *OuterLoopLatchSuccessor;
1974
1975 BranchInst *OuterLoopLatchBI =
1976 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1977 BranchInst *InnerLoopLatchBI =
1978 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1979 BranchInst *OuterLoopHeaderBI =
1980 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1981 BranchInst *InnerLoopHeaderBI =
1982 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1983
1984 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1985 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1986 !InnerLoopHeaderBI)
1987 return false;
1988
1989 BranchInst *InnerLoopLatchPredecessorBI =
1990 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1991 BranchInst *OuterLoopPredecessorBI =
1992 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1993
1994 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1995 return false;
1996 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1997 if (!InnerLoopHeaderSuccessor)
1998 return false;
1999
2000 // Adjust Loop Preheader and headers.
2001 // The branches in the outer loop predecessor and the outer loop header can
2002 // be unconditional branches or conditional branches with duplicates. Consider
2003 // this when updating the successors.
2004 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2005 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2006 // The outer loop header might or might not branch to the outer latch.
2007 // We are guaranteed to branch to the inner loop preheader.
2008 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
2009 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2010 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2011 DTUpdates,
2012 /*MustUpdateOnce=*/false);
2013 }
2014 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2015 InnerLoopHeaderSuccessor, DTUpdates,
2016 /*MustUpdateOnce=*/false);
2017
2018 // Adjust reduction PHI's now that the incoming block has changed.
2019 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2020 OuterLoopHeader);
2021
2022 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2023 OuterLoopPreHeader, DTUpdates);
2024
2025 // -------------Adjust loop latches-----------
2026 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2027 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2028 else
2029 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2030
2031 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2032 InnerLoopLatchSuccessor, DTUpdates);
2033
2034 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2035 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2036 else
2037 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2038
2039 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2040 OuterLoopLatchSuccessor, DTUpdates);
2041 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2042 DTUpdates);
2043
2044 DT->applyUpdates(DTUpdates);
2045 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2046 OuterLoopPreHeader);
2047
2048 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2049 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2050 InnerLoop, LI);
2051 // For PHIs in the exit block of the outer loop, outer's latch has been
2052 // replaced by Inners'.
2053 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2054
2055 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2056 // Now update the reduction PHIs in the inner and outer loop headers.
2057 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2058 for (PHINode &PHI : InnerLoopHeader->phis())
2059 if (OuterInnerReductions.contains(&PHI))
2060 InnerLoopPHIs.push_back(&PHI);
2061
2062 for (PHINode &PHI : OuterLoopHeader->phis())
2063 if (OuterInnerReductions.contains(&PHI))
2064 OuterLoopPHIs.push_back(&PHI);
2065
2066 // Now move the remaining reduction PHIs from outer to inner loop header and
2067 // vice versa. The PHI nodes must be part of a reduction across the inner and
2068 // outer loop and all the remains to do is and updating the incoming blocks.
2069 for (PHINode *PHI : OuterLoopPHIs) {
2070 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2071 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2072 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2073 }
2074 for (PHINode *PHI : InnerLoopPHIs) {
2075 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2076 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2077 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2078 }
2079
2080 // Update the incoming blocks for moved PHI nodes.
2081 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2082 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2083 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2084 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2085
2086 // Values defined in the outer loop header could be used in the inner loop
2087 // latch. In that case, we need to create LCSSA phis for them, because after
2088 // interchanging they will be defined in the new inner loop and used in the
2089 // new outer loop.
2090 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2091 for (Instruction &I :
2092 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2093 MayNeedLCSSAPhis.push_back(&I);
2094 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2095
2096 return true;
2097}
2098
2099bool LoopInterchangeTransform::adjustLoopLinks() {
2100 // Adjust all branches in the inner and outer loop.
2101 bool Changed = adjustLoopBranches();
2102 if (Changed) {
2103 // We have interchanged the preheaders so we need to interchange the data in
2104 // the preheaders as well. This is because the content of the inner
2105 // preheader was previously executed inside the outer loop.
2106 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2107 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2108 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2109 }
2110 return Changed;
2111}
2112
2116 LPMUpdater &U) {
2117 Function &F = *LN.getParent();
2118 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2119
2120 if (MaxMemInstrCount < 1) {
2121 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2122 return PreservedAnalyses::all();
2123 }
2125
2126 // Ensure minimum depth of the loop nest to do the interchange.
2127 if (!hasSupportedLoopDepth(LoopList, ORE))
2128 return PreservedAnalyses::all();
2129 // Ensure computable loop nest.
2130 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2131 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2132 return PreservedAnalyses::all();
2133 }
2134
2135 ORE.emit([&]() {
2136 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2139 << "Computed dependence info, invoking the transform.";
2140 });
2141
2142 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2143 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2144 return PreservedAnalyses::all();
2145 U.markLoopNestChanged(true);
2147}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define DEBUG_TYPE
Hexagon Common GEP
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:450
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:231
static cl::opt< unsigned int > MaxMemInstrCount("loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, cl::desc("Maximum number of load-store instructions that should be handled " "in the dependency matrix. Higher value may lead to more interchanges " "at the cost of compile-time"))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts)
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::ZeroOrMore, cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerLoopCacheAnalysis, RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
Conditional or Unconditional Branch instruction.
iterator_range< succ_op_iterator > successors()
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:61
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:292
LLVM Value Representation.
Definition Value.h:75
iterator_range< user_iterator > users()
Definition Value.h:426
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto map_range(ContainerTy &&C, FuncTy F)
Definition STLExtras.h:364
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1968
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:363
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...